From 23c3a059284189ad7fc2cdb9f2943a3076e59593 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Fri, 24 May 2024 01:53:15 +0300 Subject: [PATCH 01/39] refactor: split `LlamaChat` implementation into smaller functions --- src/evaluator/LlamaChat/LlamaChat.ts | 1607 ++++++++++++++++---------- src/evaluator/LlamaModel.ts | 7 +- 2 files changed, 971 insertions(+), 643 deletions(-) diff --git a/src/evaluator/LlamaChat/LlamaChat.ts b/src/evaluator/LlamaChat/LlamaChat.ts index 2e7a1853..4d03b91b 100644 --- a/src/evaluator/LlamaChat/LlamaChat.ts +++ b/src/evaluator/LlamaChat/LlamaChat.ts @@ -8,7 +8,7 @@ import {removeNullFields} from "../../utils/removeNullFields.js"; import {LlamaGrammarEvaluationState} from "../LlamaGrammarEvaluationState.js"; import {LlamaText} from "../../utils/LlamaText.js"; import {StopGenerationDetector} from "../../utils/StopGenerationDetector.js"; -import {QueuedTokenReleaseLock, TokenStreamRegulator} from "../../utils/TokenStreamRegulator.js"; +import {QueuedTokenRelease, QueuedTokenReleaseLock, TokenStreamRegulator} from "../../utils/TokenStreamRegulator.js"; import {EvaluationPriority} from "../LlamaContext/types.js"; import {UNKNOWN_UNICODE_CHAR} from "../../consts.js"; import {getQueuedTokensBeforeStopTrigger} from "../../utils/getQueuedTokensBeforeStopTrigger.js"; @@ -174,6 +174,7 @@ const defaultContextShiftOptions: Required = { strategy: "eraseFirstResponseAndKeepFirstSystem", lastEvaluationMetadata: null }; +const defaultRepeatPenaltyLastTokens = 64; export class LlamaChat { @@ -260,6 +261,14 @@ export class LlamaChat { } public async generateResponse( + history: ChatHistoryItem[], + options: LLamaChatGenerateResponseOptions = {} + ): Promise> { + return this._generateResponse(history, options); + } + + /** @internal */ + private async _generateResponse( history: ChatHistoryItem[], { onToken, @@ -285,673 +294,106 @@ export class LlamaChat { } = {} }: LLamaChatGenerateResponseOptions = {} ): Promise> { - const functionsEnabled = (functions != null && Object.keys(functions).length > 0); - - if (grammar != null && functionsEnabled) - throw new Error("Using both grammar and functions is not supported yet"); - - if (signal?.aborted) - throw signal.reason; - - if (this._sequence == null) - throw new DisposedError(); - - let resolvedHistory = this._sequence.isLoadedToMemory - ? history.slice() - : history.map(removeRawFromHistoryItem); - - if (resolvedHistory.length === 0 || resolvedHistory[resolvedHistory.length - 1].type !== "model") - resolvedHistory.push({ - type: "model", - response: [] - }); - - const model = this._sequence.model; - const context = this._sequence.context; - const resolvedContextShift = { - ...defaultContextShiftOptions, - ...removeNullFields(contextShift) - }; - const { - lastTokens: repeatPenaltyLastTokens = 64, - punishTokensFilter, - penalizeNewLine, - penalty, - frequencyPenalty, - presencePenalty - }: LLamaContextualRepeatPenalty = repeatPenalty === false - ? {lastTokens: 0} - : repeatPenalty; - const lastModelResponse = getLastTextModelResponseFromChatHistory(resolvedHistory); - - const res: Token[] = []; - const pendingTokens: Token[] = []; - let ignoredStartTextTokens: Token[] = []; - const functionCallTokens: Token[] = []; - const repeatPenaltyEnabled = repeatPenaltyLastTokens > 0; - const grammarEvaluationState = grammar != null - ? new LlamaGrammarEvaluationState({grammar}) - : undefined; - let functionsGrammar = functionsEnabled - ? 
new FunctionCallGrammar(model._llama, functions as NonNullable, this._chatWrapper, false) - : undefined; - let functionsEvaluationState = (functionsEnabled && functionsGrammar != null) - ? new LlamaGrammarEvaluationState({ - grammar: functionsGrammar - }) - : undefined; - const streamRegulator = new TokenStreamRegulator(); - const stopGenerationDetector = new StopGenerationDetector(); - const customStopGenerationTriggersDetector = new StopGenerationDetector(); - const functionSyntaxStartDetector = new StopGenerationDetector(); - const functionSyntaxEndDetector = new StopGenerationDetector(); - const disengageInitiallyEngagedFunctionMode = new StopGenerationDetector(); - const ignoreStartTextDetector = new StopGenerationDetector(); - const locksToReleaseOnValidGeneration: QueuedTokenReleaseLock[] = []; - const functionCallTokenSyntaxLocks: QueuedTokenReleaseLock[] = []; - - let generatedTokens = 0; - let isFirstEvaluation = true; - let inFunctionEvaluationMode = false; - let initiallyEngagedFunctionMode = false; - let lastContextWindowHistory: ChatHistoryItem[] = resolvedHistory; - let lastHistoryCompressionMetadata: object | null | undefined = resolvedContextShift.lastEvaluationMetadata; - - const ensureNotAborted = () => { - if (signal?.aborted && (!stopOnAbortSignal || res.length === 0)) - throw signal.reason; - - if (this._sequence == null) - throw new DisposedError(); - }; - - const getPenaltyTokens = () => { - if (this._sequence == null) - throw new DisposedError(); - - let punishTokens = res.slice(-repeatPenaltyLastTokens); - - if (punishTokensFilter != null) - punishTokens = punishTokensFilter(punishTokens); - - if (penalizeNewLine == null || !penalizeNewLine) { - const nlToken = model.tokens.nl; - - if (nlToken != null) - punishTokens = punishTokens.filter(token => token !== nlToken); - } - - return punishTokens; - }; - - const getResolvedHistoryWithCurrentModelResponse = () => { - if (res.length === 0) - return resolvedHistory; - - let modelResponse = model.detokenize(res); - - if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) - modelResponse = modelResponse.trimEnd(); - - if (modelResponse === "") - return resolvedHistory; - - return setLastModelTextResponseInChatHistory( - resolvedHistory, - lastModelResponse + modelResponse - ); - }; - - const removeFoundStartIgnoreTextsFromPendingTokens = () => { - if (res.length === 0 && pendingTokens.length > 0) { - ignoreStartTextDetector.clearInProgressStops(); - ignoreStartTextDetector.clearTriggeredStops(); - - let mostExhaustiveTriggeredStops: ReturnType | null = null; - - for (let i = 0; i < pendingTokens.length; i++) { - ignoreStartTextDetector.recordGeneration({ - text: model.detokenize([pendingTokens[i]]), - tokens: [pendingTokens[i]], - startNewChecks: i === 0 - }); - - if (ignoreStartTextDetector.hasTriggeredStops) { - mostExhaustiveTriggeredStops = ignoreStartTextDetector.getTriggeredStops(); - ignoreStartTextDetector.clearTriggeredStops(); - } else if (!ignoreStartTextDetector.hasInProgressStops) - break; - } - - if (mostExhaustiveTriggeredStops != null) { - const [mostExhaustiveTriggeredStop] = mostExhaustiveTriggeredStops; - - if (mostExhaustiveTriggeredStop != null) { - ignoredStartTextTokens = mostExhaustiveTriggeredStop.stopTrigger - .map((stopTrigger) => { - if (typeof stopTrigger === "string") - return model.tokenize(stopTrigger, false, "trimLeadingSpace"); - else - return [stopTrigger]; - }) - .flat(1); - - const newPendingTokens = mostExhaustiveTriggeredStop.remainingGenerations - .map((generation) => { - if 
(typeof generation === "string") - return model.tokenize(generation, false, "trimLeadingSpace"); - else - return generation; - }) - .flat(1); - pendingTokens.length = 0; - pendingTokens.push(...newPendingTokens); - } + const generateResponseState = new GenerateResponseState( + this, + this._chatWrapper, + history, + { + onToken, + signal, + stopOnAbortSignal, + maxTokens, + temperature, + minP, + topK, + topP, + grammar: grammar as never, + trimWhitespaceSuffix, + repeatPenalty, + tokenBias, + evaluationPriority, + functions, + documentFunctionParams, + contextShift, + customStopTriggers, + lastEvaluationContextWindow: { + history: lastEvaluationContextWindowHistory, + minimumOverlapPercentageToPreventContextShift } } - }; - - if (customStopTriggers != null) - StopGenerationDetector.resolveStopTriggers(customStopTriggers, model.tokenizer) - .map((stopTrigger) => customStopGenerationTriggersDetector.addStopTrigger(stopTrigger)); + ); - if (grammar != null) - StopGenerationDetector.resolveStopTriggers(grammar.stopGenerationTriggers, model.tokenizer) - .map((stopTrigger) => stopGenerationDetector.addStopTrigger(stopTrigger)); + try { + generateResponseState.ensureLastHistoryItemIsModel(); - if (functions != null && Object.keys(functions).length > 0) - functionSyntaxStartDetector.addStopTrigger([this._chatWrapper.settings.functions.call.prefix]); + // eslint-disable-next-line no-constant-condition + while (true) { + generateResponseState.startTokenLoop(); + await generateResponseState.loadContextWindow(); - // eslint-disable-next-line no-constant-condition - while (true) { - ensureNotAborted(); + if (generateResponseState.generatedTokens === 0) { + generateResponseState.addIgnoreStartTextTriggersFromChatWrapper(); + generateResponseState.addFunctionSyntaxEndTriggersFromFunctionsGrammar(); - let shouldContextShift = false; - const queuedChunkTokens = streamRegulator.getAllQueuedChunkTokens(); - const { - history: contextWindowHistory, - stopGenerationTriggers, - tokens: contextWindowTokens, - newResolvedHistory, - newHistoryCompressionMetadata, - ignoreStartText, - functionCallInitiallyEngaged, - disengageInitiallyEngagedFunctionCall - } = await getContextWindow({ - resolvedHistory: getResolvedHistoryWithCurrentModelResponse(), - resolvedContextShift, - lastHistoryCompressionMetadata, - pendingTokensCount: ignoredStartTextTokens.length + pendingTokens.length + queuedChunkTokens.length, - isFirstEvaluation, - chatWrapper: this._chatWrapper, - lastEvaluationContextWindowHistory, - minimumOverlapPercentageToPreventContextShift, - sequence: this._sequence, - minFreeContextTokens: 1, - functions: functionsEnabled ? 
functions : undefined, - documentFunctionParams - }); - ensureNotAborted(); - - if (generatedTokens === 0) { - StopGenerationDetector.resolveStopTriggers(ignoreStartText, model.tokenizer) - .map((stopTrigger) => ignoreStartTextDetector.addStopTrigger(stopTrigger)); - - if (functionsEnabled) { - initiallyEngagedFunctionMode = functionCallInitiallyEngaged; - StopGenerationDetector.resolveStopTriggers(disengageInitiallyEngagedFunctionCall, model.tokenizer) - .map((stopTrigger) => disengageInitiallyEngagedFunctionMode.addStopTrigger(stopTrigger)); - - if (initiallyEngagedFunctionMode) { - inFunctionEvaluationMode = true; - functionsGrammar = new FunctionCallGrammar( - model._llama, - functions as NonNullable, - this._chatWrapper, - true - ); - functionsEvaluationState = new LlamaGrammarEvaluationState({ - grammar: functionsGrammar - }); + if (generateResponseState.functionsEnabled) { + generateResponseState.initFunctions(); } } - } - const tokens = [...contextWindowTokens, ...ignoredStartTextTokens, ...pendingTokens, ...queuedChunkTokens]; - resolvedHistory = newResolvedHistory; - lastHistoryCompressionMetadata = newHistoryCompressionMetadata; - lastContextWindowHistory = contextWindowHistory; - const contextWindowLastModelResponse = getLastTextModelResponseFromChatHistory(contextWindowHistory); - const contextWindowsRes: Token[] = []; + generateResponseState.addStopGenerationTriggersFromChatWrapper(); + await generateResponseState.alignCurrentSequenceStateWithCurrentTokens(); - StopGenerationDetector.resolveStopTriggers(stopGenerationTriggers, model.tokenizer) - .map((stopTrigger) => stopGenerationDetector.addStopTrigger(stopTrigger)); + await generateResponseState.createNewEvaluationIterator(); + while (await generateResponseState.iterateEvaluation()) { + generateResponseState.waitOnPartialCharactersOrWhiteSpaceTokens(); - if (functionsGrammar != null) - StopGenerationDetector.resolveStopTriggers(functionsGrammar.stopGenerationTriggers, model.tokenizer) - .map((stopTrigger) => functionSyntaxEndDetector.addStopTrigger(stopTrigger)); + generateResponseState.trackGenerationForDisengageInitiallyEngagedFunctionMode(); + generateResponseState.trackFunctionSyntaxStart(); - let {firstDifferentIndex} = this._sequence.compareContextTokens(tokens); + generateResponseState.handleInitiallyEngagedFunctionModeFunctionDetection(); + generateResponseState.handleFunctionSyntax(); - // we need to decode at least one token to generate a response - if (firstDifferentIndex === tokens.length && firstDifferentIndex > 0) - firstDifferentIndex -= 1; + const functionEndSyntaxRes = generateResponseState.detectFunctionEndSyntax(); + if (functionEndSyntaxRes != null) + return functionEndSyntaxRes; - tokens.splice(0, firstDifferentIndex); + generateResponseState.recordStopGenerationEvaluation(); - if (firstDifferentIndex < this._sequence.nextTokenIndex) { - await this._sequence.eraseContextTokenRanges([{ - start: firstDifferentIndex, - end: this._sequence.nextTokenIndex - }]); - ensureNotAborted(); - } + generateResponseState.popStreamRegulatorFreeTokens(); + generateResponseState.removeFoundStartIgnoreTextsFromPendingTokens(); + const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger(); + if (stopGenerationTriggerRes != null) + return stopGenerationTriggerRes; - const evaluationIterator = this._sequence.evaluate(tokens, removeNullFields({ - temperature, minP, topK, topP, - grammarEvaluationState: () => { - if (inFunctionEvaluationMode) - return functionsEvaluationState; + 
generateResponseState.spliceIgnoreStartTextDetectedTokens(); - return grammarEvaluationState; - }, - repeatPenalty: !repeatPenaltyEnabled ? undefined : { - punishTokens: getPenaltyTokens, - penalty, - frequencyPenalty, - presencePenalty - }, - tokenBias, - evaluationPriority, - yieldEogToken: true - })); - - try { - let currentIteration = await evaluationIterator.next(); - while (currentIteration.done !== true) { - const token = currentIteration.value; - let replacementToken: Token | undefined = undefined; - - ensureNotAborted(); - generatedTokens++; - - const tokens = [token]; - const text = model.detokenize([token]); - const queuedTokenRelease = streamRegulator.addChunk({tokens, text}); - - if (initiallyEngagedFunctionMode) - disengageInitiallyEngagedFunctionMode.recordGeneration({text, tokens, startNewChecks: generatedTokens === 1}); - - if (text === UNKNOWN_UNICODE_CHAR || ( - (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) && text.trim() === "" - )) { - locksToReleaseOnValidGeneration.push(queuedTokenRelease.createTextIndexLock(0)); - } else { - while (locksToReleaseOnValidGeneration.length > 0) - locksToReleaseOnValidGeneration.shift()!.dispose(); - } + generateResponseState.moveFreePendingTokensToRes(); - functionSyntaxStartDetector.recordGeneration({text, tokens, queuedTokenRelease}); - - if (initiallyEngagedFunctionMode && disengageInitiallyEngagedFunctionMode.hasTriggeredStops) { - initiallyEngagedFunctionMode = false; - - let shouldStopFunctionEvaluationMode = !functionSyntaxStartDetector.hasTriggeredStops; - - if (!shouldStopFunctionEvaluationMode && functionsEnabled && functionsGrammar != null) { - const functionCallText = model.detokenize([...functionCallTokens, ...tokens]); - - try { - const functionName = functionsGrammar.parseFunctionNameFromPartialCall(functionCallText, { - enableInternalBuiltinFunctions: true, - initialFunctionCallEngaged: true - }); - - const internalBuiltinFunctions = - this._chatWrapper.getInternalBuiltinFunctions({initialFunctionCallEngaged: true}); - if (internalBuiltinFunctions[functionName] != null) { - shouldStopFunctionEvaluationMode = true; - } - } catch (err) { - if (!(err instanceof LlamaFunctionCallValidationError)) - throw err; - } - } - - if (shouldStopFunctionEvaluationMode) { - inFunctionEvaluationMode = false; - functionsGrammar = new FunctionCallGrammar( - model._llama, - functions as NonNullable, - this._chatWrapper, false - ); - functionsEvaluationState = new LlamaGrammarEvaluationState({ - grammar: functionsGrammar - }); - - functionCallTokens.length = 0; - - while (functionCallTokenSyntaxLocks.length > 0) - functionCallTokenSyntaxLocks.shift()!.dispose(); - - functionSyntaxStartDetector.clearInProgressStops(); - functionSyntaxStartDetector.clearTriggeredStops(); - - functionSyntaxEndDetector.clearInProgressStops(); - functionSyntaxEndDetector.clearTriggeredStops(); - } - } - - if (!inFunctionEvaluationMode && functionsEnabled && functionsGrammar != null && - functionSyntaxStartDetector.hasTriggeredStops && functionsEvaluationState != null - ) { - inFunctionEvaluationMode = true; - functionCallTokenSyntaxLocks.push(queuedTokenRelease.createTextIndexLock(0)); - - stopGenerationDetector.clearTriggeredStops(); - stopGenerationDetector.clearInProgressStops(); - customStopGenerationTriggersDetector.clearTriggeredStops(); - customStopGenerationTriggersDetector.clearInProgressStops(); - - pendingTokens.push(...streamRegulator.popFreeChunkTokens()); - - const triggeredStops = functionSyntaxStartDetector.getTriggeredStops(); - const 
partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk(model.tokenizer); - - const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger( - triggeredStops, - partiallyFreeTokens, - model.tokenizer - ); - pendingTokens.push(...queuedTokensBeforeStopTrigger); - - const [firstRemainingGenerationAfterStop] = triggeredStops - .map((stopTrigger) => stopTrigger.remainingGenerations) - .filter((remainingGenerations) => remainingGenerations.length > 0) - .flat(1); - - const remainingTextAfterStop = - (firstRemainingGenerationAfterStop == null || firstRemainingGenerationAfterStop.length === 0) - ? "" - : typeof firstRemainingGenerationAfterStop === "string" - ? firstRemainingGenerationAfterStop - : model.detokenize(firstRemainingGenerationAfterStop); - - functionCallTokens.push(...model.tokenize(this._chatWrapper.settings.functions.call.prefix, false, "trimLeadingSpace")); - - for (const functionCallToken of functionCallTokens) - context._acceptTokenOnGrammarEvaluationState(functionsEvaluationState, functionCallToken); - - // these tokens have to be verified that they match the function calling syntax grammar before they can be accepted, - // or the context state should be modified to not include the incompatible tokens - const remainingTextTokens = model.tokenize(remainingTextAfterStop, false, "trimLeadingSpace"); - let unfitTokens: Token[] = []; - - for (let i = 0; i < remainingTextTokens.length; i++) { - const remainingToken = remainingTextTokens[i]; - const canBeNextToken = context._canBeNextTokenForGrammarEvaluationState( - functionsEvaluationState, - remainingToken - ); - - if (!canBeNextToken) { - unfitTokens = remainingTextTokens.slice(i); - break; - } - - context._acceptTokenOnGrammarEvaluationState(functionsEvaluationState, remainingToken); - functionCallTokens.push(remainingToken); - } - - if (unfitTokens.length > 0) { - const unfitTokensText = model.detokenize(unfitTokens); // the current token text must end with it - const currentTokenText = queuedTokenRelease.text; - let replacementTokens: Token[]; - - if (!currentTokenText.endsWith(unfitTokensText)) { - console.warn(getConsoleLogPrefix() + "The current token text does not end with the unfit function call syntax tokens text"); - replacementTokens = remainingTextTokens.slice(0, -unfitTokens.length); - } else { - const newCurrentTokensText = currentTokenText.slice(0, -unfitTokensText.length); - replacementTokens = model.tokenize(newCurrentTokensText, false, "trimLeadingSpace"); - } - - if (replacementTokens.length > 0) { - replacementToken = replacementTokens[0]; - queuedTokenRelease.modifyTokensAndText(replacementTokens, model.detokenize([replacementToken])); - } - } - } else if (inFunctionEvaluationMode) { - functionCallTokens.push(...tokens); - functionCallTokenSyntaxLocks.push(queuedTokenRelease.createTextIndexLock(0)); - functionSyntaxEndDetector.recordGeneration({text, tokens, queuedTokenRelease}); - } - - if (inFunctionEvaluationMode && functionSyntaxEndDetector.hasTriggeredStops && functionsGrammar != null) { - const functionCallText = model.detokenize(functionCallTokens); - const functionCall = functionsGrammar.parseFunctionCall(functionCallText); - - let modelResponse = model.detokenize(res); - let contextWindowModelResponse = model.detokenize(contextWindowsRes); - - if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) { - modelResponse = modelResponse.trimEnd(); - contextWindowModelResponse = contextWindowModelResponse.trimEnd(); - } - - return { - response: modelResponse, - lastEvaluation: { - 
contextWindow: setLastModelTextResponseInChatHistory( - lastContextWindowHistory, - contextWindowLastModelResponse + contextWindowModelResponse - ), - cleanHistory: setLastModelTextResponseInChatHistory( - resolvedHistory, - lastModelResponse + modelResponse - ), - contextShiftMetadata: lastHistoryCompressionMetadata - }, - - // prevent infinite TS type instantiation - functionCall: functionCall satisfies LlamaChatResponseFunctionCall> as any, - - metadata: { - stopReason: "functionCall" - } - }; - } - - if (!inFunctionEvaluationMode) { - stopGenerationDetector.recordGeneration({text, tokens, queuedTokenRelease}); - customStopGenerationTriggersDetector.recordGeneration({text, tokens, queuedTokenRelease}); - } - - pendingTokens.push(...streamRegulator.popFreeChunkTokens()); - - removeFoundStartIgnoreTextsFromPendingTokens(); - - if (stopGenerationDetector.hasTriggeredStops || customStopGenerationTriggersDetector.hasTriggeredStops || - model.isEogToken(token) - ) { - stopGenerationDetector.clearInProgressStops(); - customStopGenerationTriggersDetector.clearInProgressStops(); - pendingTokens.push(...streamRegulator.popFreeChunkTokens()); - - const triggeredStops = stopGenerationDetector.hasTriggeredStops - ? stopGenerationDetector.getTriggeredStops() - : customStopGenerationTriggersDetector.getTriggeredStops(); - - const partiallyFreeTokens = streamRegulator.getPartiallyFreeChunk(model.tokenizer); - - const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger( - triggeredStops, - partiallyFreeTokens, - model.tokenizer - ); - pendingTokens.push(...queuedTokensBeforeStopTrigger); - - const [firstRemainingGenerationAfterStop] = triggeredStops - .map((stopTrigger) => stopTrigger.remainingGenerations) - .filter((remainingGenerations) => remainingGenerations.length > 0) - .flat(1); - - removeFoundStartIgnoreTextsFromPendingTokens(); - - if (pendingTokens.length > 0) - onToken?.(pendingTokens.slice()); - - res.push(...pendingTokens); - contextWindowsRes.push(...pendingTokens); - pendingTokens.length = 0; - - let modelResponse = model.detokenize(res); - let contextWindowModelResponse = model.detokenize(contextWindowsRes); - - if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) { - modelResponse = modelResponse.trimEnd(); - contextWindowModelResponse = contextWindowModelResponse.trimEnd(); - } - - const lastEvaluation = { - contextWindow: setLastModelTextResponseInChatHistory( - lastContextWindowHistory, - contextWindowLastModelResponse + contextWindowModelResponse - ), - cleanHistory: setLastModelTextResponseInChatHistory( - resolvedHistory, - lastModelResponse + modelResponse - ), - contextShiftMetadata: lastHistoryCompressionMetadata - }; - const isEogToken = model.isEogToken(token); - - if (isEogToken || stopGenerationDetector.hasTriggeredStops) { - return { - response: modelResponse, - lastEvaluation, - metadata: { - remainingGenerationAfterStop: firstRemainingGenerationAfterStop, - stopReason: isEogToken - ? 
"eogToken" - : "stopGenerationTrigger" - } - }; - } - - return { - response: modelResponse, - lastEvaluation, - metadata: { - remainingGenerationAfterStop: firstRemainingGenerationAfterStop, - stopReason: "customStopTrigger", - customStopTrigger: triggeredStops[0].stopTrigger - } - }; - } - - const maxTokensTriggered = maxTokens != null && maxTokens > 0 && generatedTokens >= maxTokens; - - if (res.length === 0) { - ignoreStartTextDetector.clearInProgressStops(); - ignoreStartTextDetector.clearTriggeredStops(); + const maxTokensTriggerRes = generateResponseState.handleMaxTokensTrigger(); + if (maxTokensTriggerRes != null) + return maxTokensTriggerRes; - ignoreStartTextDetector.recordGeneration({ - text: model.detokenize(pendingTokens), - tokens: pendingTokens - }); - } - - if (pendingTokens.length > 0 && (maxTokensTriggered || !ignoreStartTextDetector.hasInProgressStops)) { - removeFoundStartIgnoreTextsFromPendingTokens(); - - if (pendingTokens.length > 0) { - onToken?.(pendingTokens.slice()); - res.push(...pendingTokens); - contextWindowsRes.push(...pendingTokens); - pendingTokens.length = 0; - } - } - - if (maxTokensTriggered) { - let modelResponse = model.detokenize(res); - let contextWindowModelResponse = model.detokenize(contextWindowsRes); - - if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) { - modelResponse = modelResponse.trimEnd(); - contextWindowModelResponse = contextWindowModelResponse.trimEnd(); - } - - return { - response: modelResponse, - lastEvaluation: { - contextWindow: setLastModelTextResponseInChatHistory( - lastContextWindowHistory, - contextWindowLastModelResponse + contextWindowModelResponse - ), - cleanHistory: setLastModelTextResponseInChatHistory( - resolvedHistory, - lastModelResponse + modelResponse - ), - contextShiftMetadata: lastHistoryCompressionMetadata - }, - metadata: { - stopReason: "maxTokens" - } - }; - } - - if (this._sequence.nextTokenIndex >= context.contextSize - 1) { - shouldContextShift = true; + if (generateResponseState.updateShouldContextShift()) break; - } - if (signal?.aborted && stopOnAbortSignal) { - if (res.length === 0) - throw signal.reason; - - let modelResponse = model.detokenize(res); - let contextWindowModelResponse = model.detokenize(contextWindowsRes); - - if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) { - modelResponse = modelResponse.trimEnd(); - contextWindowModelResponse = contextWindowModelResponse.trimEnd(); - } - - return { - response: modelResponse, - lastEvaluation: { - contextWindow: setLastModelTextResponseInChatHistory( - lastContextWindowHistory, - contextWindowLastModelResponse + contextWindowModelResponse - ), - cleanHistory: setLastModelTextResponseInChatHistory( - resolvedHistory, - lastModelResponse + modelResponse - ), - contextShiftMetadata: lastHistoryCompressionMetadata - }, - metadata: { - stopReason: "abort" - } - }; - } - currentIteration = await evaluationIterator.next(replacementToken); + const abortRes = generateResponseState.handleAbortTrigger(); + if (abortRes != null) + return abortRes; } - } finally { - await evaluationIterator.return(); - } - isFirstEvaluation = false; + generateResponseState.isFirstEvaluation = false; - if (shouldContextShift) - continue; + if (generateResponseState.shouldContextShift) + continue; - break; - } + break; + } - throw new Error("The context size is too small to generate a response"); + throw new Error("The context size is too small to generate a response"); + } finally { + generateResponseState.dispose(); + } } } @@ -1303,3 +745,886 @@ async 
function getContextWindow({ disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [] }; } + +class GenerateResponseState { + private readonly llamaChat: LlamaChat; + private readonly chatWrapper: ChatWrapper; + + private readonly history: ChatHistoryItem[]; + private readonly onToken: LLamaChatGenerateResponseOptions["onToken"]; + private readonly signal: LLamaChatGenerateResponseOptions["signal"]; + private readonly stopOnAbortSignal: LLamaChatGenerateResponseOptions["stopOnAbortSignal"]; + private readonly maxTokens: LLamaChatGenerateResponseOptions["maxTokens"]; + private readonly temperature: LLamaChatGenerateResponseOptions["temperature"]; + private readonly minP: LLamaChatGenerateResponseOptions["minP"]; + private readonly topK: LLamaChatGenerateResponseOptions["topK"]; + private readonly topP: LLamaChatGenerateResponseOptions["topP"]; + private readonly grammar: LLamaChatGenerateResponseOptions["grammar"]; + private readonly trimWhitespaceSuffix: LLamaChatGenerateResponseOptions["trimWhitespaceSuffix"]; + private readonly tokenBias: LLamaChatGenerateResponseOptions["tokenBias"]; + private readonly evaluationPriority: LLamaChatGenerateResponseOptions["evaluationPriority"]; + private readonly functions: LLamaChatGenerateResponseOptions["functions"]; + private readonly documentFunctionParams: LLamaChatGenerateResponseOptions["documentFunctionParams"]; + private readonly contextShift: LLamaChatGenerateResponseOptions["contextShift"]; + private readonly customStopTriggers: LLamaChatGenerateResponseOptions["customStopTriggers"]; + private readonly lastEvaluationContextWindowHistory: Exclude["lastEvaluationContextWindow"], undefined>["history"]; + private readonly minimumOverlapPercentageToPreventContextShift: Exclude["lastEvaluationContextWindow"], undefined>["minimumOverlapPercentageToPreventContextShift"], undefined>; + + public readonly functionsEnabled: boolean; + private readonly repeatPenaltyEnabled: boolean; + private readonly resolvedContextShift: Required; + private readonly resolvedRepeatPenalty: LLamaContextualRepeatPenalty & { + lastTokens: number + }; + private readonly lastModelResponse: string; + private readonly grammarEvaluationState: LlamaGrammarEvaluationState | undefined; + private functionsGrammar: FunctionCallGrammar> | undefined; + private functionsEvaluationState: LlamaGrammarEvaluationState | undefined; + + private readonly streamRegulator = new TokenStreamRegulator(); + private readonly stopGenerationDetector = new StopGenerationDetector(); + private readonly customStopGenerationTriggersDetector = new StopGenerationDetector(); + private readonly functionSyntaxStartDetector = new StopGenerationDetector(); + private readonly functionSyntaxEndDetector = new StopGenerationDetector(); + private readonly disengageInitiallyEngagedFunctionMode = new StopGenerationDetector(); + private readonly ignoreStartTextDetector = new StopGenerationDetector(); + private readonly locksToReleaseOnValidGeneration: QueuedTokenReleaseLock[] = []; + private readonly functionCallTokenSyntaxLocks: QueuedTokenReleaseLock[] = []; + + public resolvedHistory: ChatHistoryItem[]; + + public readonly res: Token[] = []; + public readonly pendingTokens: Token[] = []; + public ignoredStartTextTokens: Token[] = []; + public readonly functionCallTokens: Token[] = []; + + public generatedTokens = 0; + public isFirstEvaluation = true; + public inFunctionEvaluationMode = false; + public initiallyEngagedFunctionMode = false; + public lastContextWindowHistory: ChatHistoryItem[]; 
+ public lastHistoryCompressionMetadata: object | null | undefined; + + // context shift loop + public shouldContextShift = false; + public queuedChunkTokens: Token[] = []; + + public contextWindowHistory: ChatHistoryItem[] = []; + public stopGenerationTriggers: LlamaText[] = []; + public contextWindowTokens: Token[] = []; + public newResolvedHistory: ChatHistoryItem[] = []; + public newHistoryCompressionMetadata: object | null | undefined = undefined; + public ignoreStartText: LlamaText[] = []; + public functionCallInitiallyEngaged: boolean = false; + public disengageInitiallyEngagedFunctionCall: LlamaText[] = []; + + public tokens: Token[] = []; + public contextWindowLastModelResponse: string = ""; + public contextWindowsRes: Token[] = []; + + // token evaluation loop + public evaluationIterator?: AsyncGenerator; + public currentIteration?: IteratorResult; + public currentIterationReplacementToken?: Token; + public currentToken?: Token; + public currentTokens: Token[] = []; + public currentText: string = ""; + public currentQueuedTokenRelease?: QueuedTokenRelease; + + public constructor( + llamaChat: LlamaChat, + chatWrapper: ChatWrapper, + history: ChatHistoryItem[], + { + onToken, + signal, + stopOnAbortSignal = false, + maxTokens, + temperature, + minP, + topK, + topP, + grammar, + trimWhitespaceSuffix = false, + repeatPenalty = {}, + tokenBias, + evaluationPriority = 5, + functions, + documentFunctionParams, + contextShift = defaultContextShiftOptions, + customStopTriggers, + lastEvaluationContextWindow: { + history: lastEvaluationContextWindowHistory, + minimumOverlapPercentageToPreventContextShift = 0.5 + } = {} + }: LLamaChatGenerateResponseOptions = {} + ) { + this.llamaChat = llamaChat; + this.chatWrapper = chatWrapper; + + this.history = history; + this.onToken = onToken; + this.signal = signal; + this.stopOnAbortSignal = stopOnAbortSignal; + this.maxTokens = maxTokens; + this.temperature = temperature; + this.minP = minP; + this.topK = topK; + this.topP = topP; + this.grammar = grammar; + this.trimWhitespaceSuffix = trimWhitespaceSuffix; + this.tokenBias = tokenBias; + this.evaluationPriority = evaluationPriority; + this.functions = functions; + this.documentFunctionParams = documentFunctionParams; + this.contextShift = contextShift; + this.customStopTriggers = customStopTriggers; + this.lastEvaluationContextWindowHistory = lastEvaluationContextWindowHistory; + this.minimumOverlapPercentageToPreventContextShift = minimumOverlapPercentageToPreventContextShift; + + this.functionsEnabled = (this.functions != null && Object.keys(this.functions).length > 0); + + if (this.grammar != null && this.functionsEnabled) + throw new Error("Using both grammar and functions is not supported yet"); + + if (this.signal?.aborted) + throw this.signal.reason; + + if (this.llamaChat.disposed) + throw new DisposedError(); + + this.resolvedHistory = this.llamaChat.sequence.isLoadedToMemory + ? this.history.slice() + : this.history.map(removeRawFromHistoryItem); + this.resolvedContextShift = { + ...defaultContextShiftOptions, + ...removeNullFields(this.contextShift) + }; + this.resolvedRepeatPenalty = repeatPenalty === false + ? {lastTokens: 0} + : { + ...(repeatPenalty ?? {}), + lastTokens: repeatPenalty?.lastTokens ?? defaultRepeatPenaltyLastTokens + }; + this.lastModelResponse = getLastTextModelResponseFromChatHistory(this.resolvedHistory); + this.repeatPenaltyEnabled = this.resolvedRepeatPenalty.lastTokens > 0; + this.grammarEvaluationState = this.grammar != null + ? 
new LlamaGrammarEvaluationState({grammar: this.grammar}) + : undefined; + this.functionsGrammar = this.functionsEnabled + ? new FunctionCallGrammar(this.llamaChat.model._llama, this.functions as NonNullable, this.chatWrapper, false) + : undefined; + this.functionsEvaluationState = (this.functionsEnabled && this.functionsGrammar != null) + ? new LlamaGrammarEvaluationState({ + grammar: this.functionsGrammar + }) + : undefined; + + this.lastContextWindowHistory = this.resolvedHistory; + this.lastHistoryCompressionMetadata = this.resolvedContextShift; + + if (this.customStopTriggers != null) + StopGenerationDetector.resolveStopTriggers(this.customStopTriggers, this.llamaChat.model.tokenizer) + .map((stopTrigger) => this.customStopGenerationTriggersDetector.addStopTrigger(stopTrigger)); + + if (this.grammar != null) + StopGenerationDetector.resolveStopTriggers(this.grammar.stopGenerationTriggers, this.llamaChat.model.tokenizer) + .map((stopTrigger) => this.stopGenerationDetector.addStopTrigger(stopTrigger)); + + if (this.functions != null && Object.keys(this.functions).length > 0) + this.functionSyntaxStartDetector.addStopTrigger([this.chatWrapper.settings.functions.call.prefix]); + + this.getPenaltyTokens = this.getPenaltyTokens.bind(this); + } + + public dispose() { + + } + + public [Symbol.dispose]() { + this.dispose(); + } + + public ensureLastHistoryItemIsModel() { + if (this.resolvedHistory.length === 0 || this.resolvedHistory[this.resolvedHistory.length - 1].type !== "model") + this.resolvedHistory.push({ + type: "model", + response: [] + }); + } + + public ensureLastHistoryItemIsUser() { + if (this.resolvedHistory.length === 0 || this.resolvedHistory[this.resolvedHistory.length - 1].type !== "user") + this.resolvedHistory.push({ + type: "user", + text: "" + }); + } + + public ensureNotAborted() { + if (this.signal?.aborted && (!this.stopOnAbortSignal || this.res.length === 0)) + throw this.signal.reason; + + if (this.llamaChat.disposed) + throw new DisposedError(); + } + + public getPenaltyTokens() { + if (this.llamaChat.disposed) + throw new DisposedError(); + + let punishTokens = this.res.slice(-this.resolvedRepeatPenalty.lastTokens); + + if (this.resolvedRepeatPenalty.punishTokensFilter != null) + punishTokens = this.resolvedRepeatPenalty.punishTokensFilter(punishTokens); + + if (this.resolvedRepeatPenalty.penalizeNewLine == null || !this.resolvedRepeatPenalty.penalizeNewLine) { + const nlToken = this.llamaChat.model.tokens.nl; + + if (nlToken != null) + punishTokens = punishTokens.filter(token => token !== nlToken); + } + + return punishTokens; + } + + public getResolvedHistoryWithCurrentModelResponse() { + if (this.res.length === 0) + return this.resolvedHistory; + + let modelResponse = this.llamaChat.model.detokenize(this.res); + + if (this.grammar?.trimWhitespaceSuffix || this.trimWhitespaceSuffix) + modelResponse = modelResponse.trimEnd(); + + if (modelResponse === "") + return this.resolvedHistory; + + return setLastModelTextResponseInChatHistory( + this.resolvedHistory, + this.lastModelResponse + modelResponse + ); + } + + public removeFoundStartIgnoreTextsFromPendingTokens() { + if (this.res.length === 0 && this.pendingTokens.length > 0) { + this.ignoreStartTextDetector.clearInProgressStops(); + this.ignoreStartTextDetector.clearTriggeredStops(); + + let mostExhaustiveTriggeredStops: ReturnType | null = null; + + for (let i = 0; i < this.pendingTokens.length; i++) { + this.ignoreStartTextDetector.recordGeneration({ + text: 
this.llamaChat.model.detokenize([this.pendingTokens[i]]), + tokens: [this.pendingTokens[i]], + startNewChecks: i === 0 + }); + + if (this.ignoreStartTextDetector.hasTriggeredStops) { + mostExhaustiveTriggeredStops = this.ignoreStartTextDetector.getTriggeredStops(); + this.ignoreStartTextDetector.clearTriggeredStops(); + } else if (!this.ignoreStartTextDetector.hasInProgressStops) + break; + } + + if (mostExhaustiveTriggeredStops != null) { + const [mostExhaustiveTriggeredStop] = mostExhaustiveTriggeredStops; + + if (mostExhaustiveTriggeredStop != null) { + this.ignoredStartTextTokens = mostExhaustiveTriggeredStop.stopTrigger + .map((stopTrigger) => { + if (typeof stopTrigger === "string") + return this.llamaChat.model.tokenize(stopTrigger, false, "trimLeadingSpace"); + else + return [stopTrigger]; + }) + .flat(1); + + const newPendingTokens = mostExhaustiveTriggeredStop.remainingGenerations + .map((generation) => { + if (typeof generation === "string") + return this.llamaChat.model.tokenize(generation, false, "trimLeadingSpace"); + else + return generation; + }) + .flat(1); + this.pendingTokens.length = 0; + this.pendingTokens.push(...newPendingTokens); + } + } + } + } + + public startTokenLoop() { + this.ensureNotAborted(); + this.shouldContextShift = false; + this.queuedChunkTokens = this.streamRegulator.getAllQueuedChunkTokens(); + } + + public async loadContextWindow() { + const { + history: contextWindowHistory, + stopGenerationTriggers, + tokens: contextWindowTokens, + newResolvedHistory, + newHistoryCompressionMetadata, + ignoreStartText, + functionCallInitiallyEngaged, + disengageInitiallyEngagedFunctionCall + } = await getContextWindow({ + resolvedHistory: this.getResolvedHistoryWithCurrentModelResponse(), + resolvedContextShift: this.resolvedContextShift, + lastHistoryCompressionMetadata: this.lastHistoryCompressionMetadata, + pendingTokensCount: this.ignoredStartTextTokens.length + this.pendingTokens.length + this.queuedChunkTokens.length, + isFirstEvaluation: this.isFirstEvaluation, + chatWrapper: this.chatWrapper, + lastEvaluationContextWindowHistory: this.lastEvaluationContextWindowHistory, + minimumOverlapPercentageToPreventContextShift: this.minimumOverlapPercentageToPreventContextShift, + sequence: this.llamaChat.sequence, + minFreeContextTokens: 1, + functions: this.functionsEnabled ? 
this.functions : undefined, + documentFunctionParams: this.documentFunctionParams + }); + + this.contextWindowHistory = contextWindowHistory; + this.stopGenerationTriggers = stopGenerationTriggers; + this.contextWindowTokens = contextWindowTokens; + this.newResolvedHistory = newResolvedHistory; + this.newHistoryCompressionMetadata = newHistoryCompressionMetadata; + this.ignoreStartText = ignoreStartText; + this.functionCallInitiallyEngaged = functionCallInitiallyEngaged; + this.disengageInitiallyEngagedFunctionCall = disengageInitiallyEngagedFunctionCall; + + this.ensureNotAborted(); + + this.tokens = [...this.contextWindowTokens, ...this.ignoredStartTextTokens, ...this.pendingTokens, ...this.queuedChunkTokens]; + this.resolvedHistory = this.newResolvedHistory; + this.lastHistoryCompressionMetadata = this.newHistoryCompressionMetadata; + this.lastContextWindowHistory = this.contextWindowHistory; + this.contextWindowLastModelResponse = getLastTextModelResponseFromChatHistory(this.contextWindowHistory); + this.contextWindowsRes = []; + } + + public addIgnoreStartTextTriggersFromChatWrapper() { + StopGenerationDetector.resolveStopTriggers(this.ignoreStartText, this.llamaChat.model.tokenizer) + .map((stopTrigger) => this.ignoreStartTextDetector.addStopTrigger(stopTrigger)); + } + + public addFunctionSyntaxEndTriggersFromFunctionsGrammar() { + if (this.functionsGrammar != null) + StopGenerationDetector.resolveStopTriggers(this.functionsGrammar.stopGenerationTriggers, this.llamaChat.model.tokenizer) + .map((stopTrigger) => this.functionSyntaxEndDetector.addStopTrigger(stopTrigger)); + } + + public addStopGenerationTriggersFromChatWrapper() { + StopGenerationDetector.resolveStopTriggers(this.stopGenerationTriggers, this.llamaChat.model.tokenizer) + .map((stopTrigger) => this.stopGenerationDetector.addStopTrigger(stopTrigger)); + } + + public initFunctions() { + this.initiallyEngagedFunctionMode = this.functionCallInitiallyEngaged; + StopGenerationDetector.resolveStopTriggers(this.disengageInitiallyEngagedFunctionCall, this.llamaChat.model.tokenizer) + .map((stopTrigger) => this.disengageInitiallyEngagedFunctionMode.addStopTrigger(stopTrigger)); + + if (this.initiallyEngagedFunctionMode) { + this.inFunctionEvaluationMode = true; + this.functionsGrammar = new FunctionCallGrammar( + this.llamaChat.model._llama, + this.functions as NonNullable, + this.chatWrapper, + true + ); + this.functionsEvaluationState = new LlamaGrammarEvaluationState({ + grammar: this.functionsGrammar + }); + } + } + + public async alignCurrentSequenceStateWithCurrentTokens() { + let {firstDifferentIndex} = this.llamaChat.sequence.compareContextTokens(this.tokens); + + // we need to decode at least one token to generate a response + if (firstDifferentIndex === this.tokens.length && firstDifferentIndex > 0) + firstDifferentIndex -= 1; + + this.tokens.splice(0, firstDifferentIndex); + + if (firstDifferentIndex < this.llamaChat.sequence.nextTokenIndex) { + await this.llamaChat.sequence.eraseContextTokenRanges([{ + start: firstDifferentIndex, + end: this.llamaChat.sequence.nextTokenIndex + }]); + this.ensureNotAborted(); + } + } + + public async createNewEvaluationIterator() { + if (this.evaluationIterator != null) + await this.evaluationIterator.return(); + + this.currentIterationReplacementToken = undefined; + this.evaluationIterator = this.llamaChat.sequence.evaluate(this.tokens, removeNullFields({ + temperature: this.temperature, + minP: this.minP, + topK: this.topK, + topP: this.topP, + grammarEvaluationState: () => { + if 
(this.inFunctionEvaluationMode) + return this.functionsEvaluationState; + + return this.grammarEvaluationState; + }, + repeatPenalty: !this.repeatPenaltyEnabled ? undefined : { + punishTokens: this.getPenaltyTokens, + penalty: this.resolvedRepeatPenalty.penalty, + frequencyPenalty: this.resolvedRepeatPenalty.frequencyPenalty, + presencePenalty: this.resolvedRepeatPenalty.presencePenalty + }, + tokenBias: this.tokenBias, + evaluationPriority: this.evaluationPriority, + yieldEogToken: true + })); + } + + public async iterateEvaluation() { + this.currentIteration = await this.evaluationIterator?.next(this.currentIterationReplacementToken); + this.currentIterationReplacementToken = undefined; + + this.ensureNotAborted(); + this.generatedTokens++; + + if (this.currentIteration != null && this.currentIteration?.done !== true) { + this.currentToken = this.currentIteration.value; + this.currentTokens = [this.currentToken]; + this.currentText = this.llamaChat.model.detokenize(this.currentTokens); + this.currentQueuedTokenRelease = this.streamRegulator.addChunk({ + tokens: this.currentTokens, + text: this.currentText + }); + + return true; + } + + return false; + } + + public waitOnPartialCharactersOrWhiteSpaceTokens() { + if (this.currentText === UNKNOWN_UNICODE_CHAR || ( + (this.grammar?.trimWhitespaceSuffix || this.trimWhitespaceSuffix) && this.currentText?.trim() === "" + )) { + if (this.currentQueuedTokenRelease != null) + this.locksToReleaseOnValidGeneration.push(this.currentQueuedTokenRelease.createTextIndexLock(0)); + } else { + while (this.locksToReleaseOnValidGeneration.length > 0) + this.locksToReleaseOnValidGeneration.shift()!.dispose(); + } + } + + public trackGenerationForDisengageInitiallyEngagedFunctionMode() { + if (this.initiallyEngagedFunctionMode) + this.disengageInitiallyEngagedFunctionMode.recordGeneration({ + text: this.currentText, + tokens: this.currentTokens, + startNewChecks: this.generatedTokens === 1 + }); + } + + public trackFunctionSyntaxStart() { + this.functionSyntaxStartDetector.recordGeneration({ + text: this.currentText, + tokens: this.currentTokens, + queuedTokenRelease: this.currentQueuedTokenRelease + }); + } + + public handleInitiallyEngagedFunctionModeFunctionDetection() { + if (this.initiallyEngagedFunctionMode && this.disengageInitiallyEngagedFunctionMode.hasTriggeredStops) { + this.initiallyEngagedFunctionMode = false; + + let shouldStopFunctionEvaluationMode = !this.functionSyntaxStartDetector.hasTriggeredStops; + + if (!shouldStopFunctionEvaluationMode && this.functionsEnabled && this.functionsGrammar != null) { + const functionCallText = this.llamaChat.model.detokenize([...this.functionCallTokens, ...this.currentTokens]); + + try { + const functionName = this.functionsGrammar.parseFunctionNameFromPartialCall(functionCallText, { + enableInternalBuiltinFunctions: true, + initialFunctionCallEngaged: true + }); + + const internalBuiltinFunctions = + this.chatWrapper.getInternalBuiltinFunctions({initialFunctionCallEngaged: true}); + if (internalBuiltinFunctions[functionName] != null) { + shouldStopFunctionEvaluationMode = true; + } + } catch (err) { + if (!(err instanceof LlamaFunctionCallValidationError)) + throw err; + } + } + + if (shouldStopFunctionEvaluationMode) { + this.inFunctionEvaluationMode = false; + this.functionsGrammar = new FunctionCallGrammar( + this.llamaChat.model._llama, + this.functions as NonNullable, + this.chatWrapper, + false + ); + this.functionsEvaluationState = new LlamaGrammarEvaluationState({ + grammar: this.functionsGrammar + 
}); + + this.functionCallTokens.length = 0; + + while (this.functionCallTokenSyntaxLocks.length > 0) + this.functionCallTokenSyntaxLocks.shift()!.dispose(); + + this.functionSyntaxStartDetector.clearInProgressStops(); + this.functionSyntaxStartDetector.clearTriggeredStops(); + + this.functionSyntaxEndDetector.clearInProgressStops(); + this.functionSyntaxEndDetector.clearTriggeredStops(); + } + } + } + + public handleFunctionSyntax() { + if (this.currentQueuedTokenRelease != null && !this.inFunctionEvaluationMode && this.functionsEnabled && + this.functionsGrammar != null && this.functionSyntaxStartDetector.hasTriggeredStops && this.functionsEvaluationState != null + ) { + this.inFunctionEvaluationMode = true; + this.functionCallTokenSyntaxLocks.push(this.currentQueuedTokenRelease.createTextIndexLock(0)); + + this.stopGenerationDetector.clearTriggeredStops(); + this.stopGenerationDetector.clearInProgressStops(); + this.customStopGenerationTriggersDetector.clearTriggeredStops(); + this.customStopGenerationTriggersDetector.clearInProgressStops(); + + this.pendingTokens.push(...this.streamRegulator.popFreeChunkTokens()); + + const triggeredStops = this.functionSyntaxStartDetector.getTriggeredStops(); + const partiallyFreeTokens = this.streamRegulator.getPartiallyFreeChunk(this.llamaChat.model.tokenizer); + + const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger( + triggeredStops, + partiallyFreeTokens, + this.llamaChat.model.tokenizer + ); + this.pendingTokens.push(...queuedTokensBeforeStopTrigger); + + const [firstRemainingGenerationAfterStop] = triggeredStops + .map((stopTrigger) => stopTrigger.remainingGenerations) + .filter((remainingGenerations) => remainingGenerations.length > 0) + .flat(1); + + const remainingTextAfterStop = + (firstRemainingGenerationAfterStop == null || firstRemainingGenerationAfterStop.length === 0) + ? "" + : typeof firstRemainingGenerationAfterStop === "string" + ? 
firstRemainingGenerationAfterStop + : this.llamaChat.model.detokenize(firstRemainingGenerationAfterStop); + + this.functionCallTokens.push(...this.llamaChat.model.tokenize(this.chatWrapper.settings.functions.call.prefix, false, "trimLeadingSpace")); + + for (const functionCallToken of this.functionCallTokens) + this.llamaChat.context._acceptTokenOnGrammarEvaluationState(this.functionsEvaluationState, functionCallToken); + + // these tokens have to be verified that they match the function calling syntax grammar before they can be accepted, + // or the context state should be modified to not include the incompatible tokens + const remainingTextTokens = this.llamaChat.model.tokenize(remainingTextAfterStop, false, "trimLeadingSpace"); + let unfitTokens: Token[] = []; + + for (let i = 0; i < remainingTextTokens.length; i++) { + const remainingToken = remainingTextTokens[i]; + const canBeNextToken = this.llamaChat.context._canBeNextTokenForGrammarEvaluationState( + this.functionsEvaluationState, + remainingToken + ); + + if (!canBeNextToken) { + unfitTokens = remainingTextTokens.slice(i); + break; + } + + this.llamaChat.context._acceptTokenOnGrammarEvaluationState(this.functionsEvaluationState, remainingToken); + this.functionCallTokens.push(remainingToken); + } + + if (unfitTokens.length > 0) { + const unfitTokensText = this.llamaChat.model.detokenize(unfitTokens); // the current token text must end with it + const currentTokenText = this.currentQueuedTokenRelease.text; + let replacementTokens: Token[]; + + if (!currentTokenText.endsWith(unfitTokensText)) { + console.warn(getConsoleLogPrefix() + "The current token text does not end with the unfit function call syntax tokens text"); + replacementTokens = remainingTextTokens.slice(0, -unfitTokens.length); + } else { + const newCurrentTokensText = currentTokenText.slice(0, -unfitTokensText.length); + replacementTokens = this.llamaChat.model.tokenize(newCurrentTokensText, false, "trimLeadingSpace"); + } + + if (replacementTokens.length > 0) { + this.currentIterationReplacementToken = replacementTokens[0]; + this.currentQueuedTokenRelease.modifyTokensAndText( + replacementTokens, + this.llamaChat.model.detokenize([this.currentIterationReplacementToken]) + ); + } + } + } else if (this.inFunctionEvaluationMode) { + this.functionCallTokens.push(...this.currentTokens); + + if (this.currentQueuedTokenRelease != null) + this.functionCallTokenSyntaxLocks.push(this.currentQueuedTokenRelease.createTextIndexLock(0)); + + this.functionSyntaxEndDetector.recordGeneration({ + text: this.currentText, + tokens: this.currentTokens, + queuedTokenRelease: this.currentQueuedTokenRelease + }); + } + } + + public detectFunctionEndSyntax(): LlamaChatResponse | undefined { + if (this.inFunctionEvaluationMode && this.functionSyntaxEndDetector.hasTriggeredStops && this.functionsGrammar != null) { + const functionCallText = this.llamaChat.model.detokenize(this.functionCallTokens); + const functionCall = this.functionsGrammar.parseFunctionCall(functionCallText); + + let modelResponse = this.llamaChat.model.detokenize(this.res); + let contextWindowModelResponse = this.llamaChat.model.detokenize(this.contextWindowsRes); + + if (this.grammar?.trimWhitespaceSuffix || this.trimWhitespaceSuffix) { + modelResponse = modelResponse.trimEnd(); + contextWindowModelResponse = contextWindowModelResponse.trimEnd(); + } + + return { + response: modelResponse, + lastEvaluation: { + contextWindow: setLastModelTextResponseInChatHistory( + this.lastContextWindowHistory, + 
this.contextWindowLastModelResponse + contextWindowModelResponse + ), + cleanHistory: setLastModelTextResponseInChatHistory( + this.resolvedHistory, + this.lastModelResponse + modelResponse + ), + contextShiftMetadata: this.lastHistoryCompressionMetadata + }, + + // prevent infinite TS type instantiation + functionCall: functionCall satisfies LlamaChatResponseFunctionCall> as any, + + metadata: { + stopReason: "functionCall" + } + }; + } + + return undefined; + } + + public recordStopGenerationEvaluation() { + if (!this.inFunctionEvaluationMode) { + this.stopGenerationDetector.recordGeneration({ + text: this.currentText, + tokens: this.currentTokens, + queuedTokenRelease: this.currentQueuedTokenRelease + }); + this.customStopGenerationTriggersDetector.recordGeneration({ + text: this.currentText, + tokens: this.currentTokens, + queuedTokenRelease: this.currentQueuedTokenRelease + }); + } + } + + public popStreamRegulatorFreeTokens() { + this.pendingTokens.push(...this.streamRegulator.popFreeChunkTokens()); + } + + public handleStopGenerationTrigger(): LlamaChatResponse | undefined { + if (this.stopGenerationDetector.hasTriggeredStops || this.customStopGenerationTriggersDetector.hasTriggeredStops || + this.llamaChat.model.isEogToken(this.currentToken) + ) { + this.stopGenerationDetector.clearInProgressStops(); + this.customStopGenerationTriggersDetector.clearInProgressStops(); + this.pendingTokens.push(...this.streamRegulator.popFreeChunkTokens()); + + const triggeredStops = this.stopGenerationDetector.hasTriggeredStops + ? this.stopGenerationDetector.getTriggeredStops() + : this.customStopGenerationTriggersDetector.getTriggeredStops(); + + const partiallyFreeTokens = this.streamRegulator.getPartiallyFreeChunk(this.llamaChat.model.tokenizer); + + const queuedTokensBeforeStopTrigger = getQueuedTokensBeforeStopTrigger( + triggeredStops, + partiallyFreeTokens, + this.llamaChat.model.tokenizer + ); + this.pendingTokens.push(...queuedTokensBeforeStopTrigger); + + const [firstRemainingGenerationAfterStop] = triggeredStops + .map((stopTrigger) => stopTrigger.remainingGenerations) + .filter((remainingGenerations) => remainingGenerations.length > 0) + .flat(1); + + this.removeFoundStartIgnoreTextsFromPendingTokens(); + + if (this.pendingTokens.length > 0) + this.onToken?.(this.pendingTokens.slice()); + + this.res.push(...this.pendingTokens); + this.contextWindowsRes.push(...this.pendingTokens); + this.pendingTokens.length = 0; + + let modelResponse = this.llamaChat.model.detokenize(this.res); + let contextWindowModelResponse = this.llamaChat.model.detokenize(this.contextWindowsRes); + + if (this.grammar?.trimWhitespaceSuffix || this.trimWhitespaceSuffix) { + modelResponse = modelResponse.trimEnd(); + contextWindowModelResponse = contextWindowModelResponse.trimEnd(); + } + + const lastEvaluation = { + contextWindow: setLastModelTextResponseInChatHistory( + this.lastContextWindowHistory, + this.contextWindowLastModelResponse + contextWindowModelResponse + ), + cleanHistory: setLastModelTextResponseInChatHistory( + this.resolvedHistory, + this.lastModelResponse + modelResponse + ), + contextShiftMetadata: this.lastHistoryCompressionMetadata + }; + const isEogToken = this.llamaChat.model.isEogToken(this.currentToken); + + if (isEogToken || this.stopGenerationDetector.hasTriggeredStops) { + return { + response: modelResponse, + lastEvaluation, + metadata: { + remainingGenerationAfterStop: firstRemainingGenerationAfterStop, + stopReason: isEogToken + ? 
"eogToken" + : "stopGenerationTrigger" + } + }; + } + + return { + response: modelResponse, + lastEvaluation, + metadata: { + remainingGenerationAfterStop: firstRemainingGenerationAfterStop, + stopReason: "customStopTrigger", + customStopTrigger: triggeredStops[0].stopTrigger + } + }; + } + + return undefined; + } + + public spliceIgnoreStartTextDetectedTokens() { + if (this.res.length === 0) { + this.ignoreStartTextDetector.clearInProgressStops(); + this.ignoreStartTextDetector.clearTriggeredStops(); + + this.ignoreStartTextDetector.recordGeneration({ + text: this.llamaChat.model.detokenize(this.pendingTokens), + tokens: this.pendingTokens + }); + } + } + + public isMaxTokensTriggered() { + return this.maxTokens != null && this.maxTokens > 0 && this.generatedTokens >= this.maxTokens; + } + + public moveFreePendingTokensToRes() { + if (this.pendingTokens.length > 0 && (this.isMaxTokensTriggered() || !this.ignoreStartTextDetector.hasInProgressStops)) { + this.removeFoundStartIgnoreTextsFromPendingTokens(); + + if (this.pendingTokens.length > 0) { + this.onToken?.(this.pendingTokens.slice()); + this.res.push(...this.pendingTokens); + this.contextWindowsRes.push(...this.pendingTokens); + this.pendingTokens.length = 0; + } + } + } + + public handleMaxTokensTrigger(): LlamaChatResponse | undefined { + if (this.isMaxTokensTriggered()) { + let modelResponse = this.llamaChat.model.detokenize(this.res); + let contextWindowModelResponse = this.llamaChat.model.detokenize(this.contextWindowsRes); + + if (this.grammar?.trimWhitespaceSuffix || this.trimWhitespaceSuffix) { + modelResponse = modelResponse.trimEnd(); + contextWindowModelResponse = contextWindowModelResponse.trimEnd(); + } + + return { + response: modelResponse, + lastEvaluation: { + contextWindow: setLastModelTextResponseInChatHistory( + this.lastContextWindowHistory, + this.contextWindowLastModelResponse + contextWindowModelResponse + ), + cleanHistory: setLastModelTextResponseInChatHistory( + this.resolvedHistory, + this.lastModelResponse + modelResponse + ), + contextShiftMetadata: this.lastHistoryCompressionMetadata + }, + metadata: { + stopReason: "maxTokens" + } + }; + } + + return undefined; + } + + public updateShouldContextShift() { + this.shouldContextShift = this.llamaChat.sequence.nextTokenIndex >= this.llamaChat.context.contextSize - 1; + return this.shouldContextShift; + } + + public handleAbortTrigger(): LlamaChatResponse | undefined { + if (this.signal?.aborted && this.stopOnAbortSignal) { + if (this.res.length === 0) + throw this.signal.reason; + + let modelResponse = this.llamaChat.model.detokenize(this.res); + let contextWindowModelResponse = this.llamaChat.model.detokenize(this.contextWindowsRes); + + if (this.grammar?.trimWhitespaceSuffix || this.trimWhitespaceSuffix) { + modelResponse = modelResponse.trimEnd(); + contextWindowModelResponse = contextWindowModelResponse.trimEnd(); + } + + return { + response: modelResponse, + lastEvaluation: { + contextWindow: setLastModelTextResponseInChatHistory( + this.lastContextWindowHistory, + this.contextWindowLastModelResponse + contextWindowModelResponse + ), + cleanHistory: setLastModelTextResponseInChatHistory( + this.resolvedHistory, + this.lastModelResponse + modelResponse + ), + contextShiftMetadata: this.lastHistoryCompressionMetadata + }, + metadata: { + stopReason: "abort" + } + }; + } + + return undefined; + } +} diff --git a/src/evaluator/LlamaModel.ts b/src/evaluator/LlamaModel.ts index 4aea5131..5157158d 100644 --- a/src/evaluator/LlamaModel.ts +++ 
b/src/evaluator/LlamaModel.ts @@ -362,14 +362,17 @@ export class LlamaModel { } /** Check whether the given token is a special token (a control-type token) */ - public isSpecialToken(token: Token): boolean { + public isSpecialToken(token: Token | undefined): boolean { + if (token == null) + return false; + const tokenType = this.getTokenType(token); return tokenType === GgufMetadataTokenizerTokenType.control; } /** Check whether the given token is an EOG (End Of Generation) token, like EOS or EOT. */ - public isEogToken(token: Token): boolean { + public isEogToken(token: Token | undefined): boolean { if (token == null) return false; From 86e86ac7992e634379ef5585cf60c3263e5aa152 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Fri, 24 May 2024 19:26:16 +0300 Subject: [PATCH 02/39] feat: preload prompt and complete a preloaded prompt --- src/evaluator/LlamaChat/LlamaChat.ts | 592 ++++++++++++++---- .../LlamaChatSession/LlamaChatSession.ts | 173 ++++- src/evaluator/LlamaContext/LlamaContext.ts | 2 - src/index.ts | 9 +- .../modelDependent/llama3/chatSession.test.ts | 44 ++ 5 files changed, 705 insertions(+), 115 deletions(-) diff --git a/src/evaluator/LlamaChat/LlamaChat.ts b/src/evaluator/LlamaChat/LlamaChat.ts index 4d03b91b..7d0af385 100644 --- a/src/evaluator/LlamaChat/LlamaChat.ts +++ b/src/evaluator/LlamaChat/LlamaChat.ts @@ -1,7 +1,9 @@ -import {DisposeAggregator, DisposedError, EventRelay} from "lifecycle-utils"; +import {DisposeAggregator, DisposedError, EventRelay, withLock} from "lifecycle-utils"; import {ChatWrapper} from "../../ChatWrapper.js"; import {LlamaContextSequence} from "../LlamaContext/LlamaContext.js"; -import {ChatHistoryItem, ChatModelFunctions, ChatModelResponse, LLamaContextualRepeatPenalty, Token, Tokenizer} from "../../types.js"; +import { + ChatHistoryItem, ChatModelFunctions, ChatModelResponse, ChatUserMessage, LLamaContextualRepeatPenalty, Token, Tokenizer +} from "../../types.js"; import {GbnfJsonSchemaToType} from "../../utils/gbnfJson/types.js"; import {LlamaGrammar} from "../LlamaGrammar.js"; import {removeNullFields} from "../../utils/removeNullFields.js"; @@ -90,7 +92,8 @@ export type LLamaChatGenerateResponseOptions = { + /** + * Complete the given user prompt without adding it or the completion to the returned context window. + */ + initialUserPrompt?: string, + + /** + * When a completion already started being generated and then the signal is aborted, + * the generation will stop and the completion will be returned as is instead of throwing an error. + * + * Defaults to `false`. 
+ */ + stopOnAbortSignal?: boolean, + + onToken?: LLamaChatGenerateResponseOptions["onToken"], + signal?: LLamaChatGenerateResponseOptions["signal"], + maxTokens?: LLamaChatGenerateResponseOptions["maxTokens"], + temperature?: LLamaChatGenerateResponseOptions["temperature"], + minP?: LLamaChatGenerateResponseOptions["minP"], + topK?: LLamaChatGenerateResponseOptions["topK"], + topP?: LLamaChatGenerateResponseOptions["topP"], + trimWhitespaceSuffix?: LLamaChatGenerateResponseOptions["trimWhitespaceSuffix"], + repeatPenalty?: LLamaChatGenerateResponseOptions["repeatPenalty"], + tokenBias?: LLamaChatGenerateResponseOptions["tokenBias"], + evaluationPriority?: LLamaChatGenerateResponseOptions["evaluationPriority"], + contextShift?: LLamaChatGenerateResponseOptions["contextShift"], + customStopTriggers?: LLamaChatGenerateResponseOptions["customStopTriggers"], + lastEvaluationContextWindow?: LLamaChatGenerateResponseOptions["lastEvaluationContextWindow"], + + grammar?: LlamaGrammar, + + /** + * Functions are not used by the model here, + * but are used for keeping the instructions given to the model about the functions in the current context state, + * to avoid context shifts. + * + * It's best to provide the same functions that were used for the previous prompt here. + */ + functions?: Functions | ChatModelFunctions, + + /** + * Functions are not used by the model here, + * but are used for keeping the instructions given to the model about the functions in the current context state, + * to avoid context shifts. + * + * It's best to provide the same value that was used for the previous prompt here. + */ + documentFunctionParams?: boolean +}; + export type LLamaChatContextShiftOptions = { /** * The number of tokens to delete from the context window to make space for new ones. 
@@ -175,12 +230,15 @@ const defaultContextShiftOptions: Required = { lastEvaluationMetadata: null }; const defaultRepeatPenaltyLastTokens = 64; +const defaultTrimWhitespaceSuffix = false; +const defaultEvaluationPriority: EvaluationPriority = 5; export class LlamaChat { /** @internal */ private readonly _chatWrapper: ChatWrapper; /** @internal */ private readonly _disposeAggregator = new DisposeAggregator(); /** @internal */ private readonly _autoDisposeSequence: boolean; + /** @internal */ private readonly _chatLock = {}; /** @internal */ private _sequence: LlamaContextSequence | null; public readonly onDispose = new EventRelay(); @@ -264,13 +322,7 @@ export class LlamaChat { history: ChatHistoryItem[], options: LLamaChatGenerateResponseOptions = {} ): Promise> { - return this._generateResponse(history, options); - } - - /** @internal */ - private async _generateResponse( - history: ChatHistoryItem[], - { + const { onToken, signal, stopOnAbortSignal = false, @@ -280,10 +332,10 @@ export class LlamaChat { topK, topP, grammar, - trimWhitespaceSuffix = false, + trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, - evaluationPriority = 5, + evaluationPriority = defaultEvaluationPriority, functions, documentFunctionParams, contextShift = defaultContextShiftOptions, @@ -292,8 +344,8 @@ export class LlamaChat { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} - }: LLamaChatGenerateResponseOptions = {} - ): Promise> { + } = options; + const generateResponseState = new GenerateResponseState( this, this._chatWrapper, @@ -307,7 +359,7 @@ export class LlamaChat { minP, topK, topP, - grammar: grammar as never, + grammar: grammar as undefined, // this is a workaround to allow passing both `functions` and `grammar` trimWhitespaceSuffix, repeatPenalty, tokenBias, @@ -323,77 +375,263 @@ export class LlamaChat { } ); - try { - generateResponseState.ensureLastHistoryItemIsModel(); + if (generateResponseState.grammar != null && generateResponseState.functionsEnabled) + throw new Error("Using both grammar and functions is not supported yet"); + + return await withLock(this._chatLock, "evaluate", signal, async (): Promise> => { + try { + generateResponseState.ensureLastHistoryItemIsModel(); - // eslint-disable-next-line no-constant-condition - while (true) { - generateResponseState.startTokenLoop(); - await generateResponseState.loadContextWindow(); + // eslint-disable-next-line no-constant-condition + while (true) { + generateResponseState.startTokenLoop(); + await generateResponseState.loadContextWindow( + generateResponseState.getResolvedHistoryWithCurrentModelResponse(), + false + ); - if (generateResponseState.generatedTokens === 0) { - generateResponseState.addIgnoreStartTextTriggersFromChatWrapper(); - generateResponseState.addFunctionSyntaxEndTriggersFromFunctionsGrammar(); + if (generateResponseState.generatedTokens === 0) { + generateResponseState.addIgnoreStartTextTriggersFromChatWrapper(); + generateResponseState.addFunctionSyntaxEndTriggersFromFunctionsGrammar(); - if (generateResponseState.functionsEnabled) { - generateResponseState.initFunctions(); + if (generateResponseState.functionsEnabled) { + generateResponseState.initFunctions(); + } } - } - generateResponseState.addStopGenerationTriggersFromChatWrapper(); - await generateResponseState.alignCurrentSequenceStateWithCurrentTokens(); + generateResponseState.addStopGenerationTriggersFromChatWrapper(); + await 
generateResponseState.alignCurrentSequenceStateWithCurrentTokens(); + + await generateResponseState.createNewEvaluationIterator(); + while (await generateResponseState.iterateEvaluation()) { + generateResponseState.waitOnPartialCharactersOrWhiteSpaceTokens(); + + generateResponseState.trackGenerationForDisengageInitiallyEngagedFunctionMode(); + generateResponseState.trackFunctionSyntaxStart(); - await generateResponseState.createNewEvaluationIterator(); - while (await generateResponseState.iterateEvaluation()) { - generateResponseState.waitOnPartialCharactersOrWhiteSpaceTokens(); + generateResponseState.handleInitiallyEngagedFunctionModeFunctionDetection(); + generateResponseState.handleFunctionSyntax(); - generateResponseState.trackGenerationForDisengageInitiallyEngagedFunctionMode(); - generateResponseState.trackFunctionSyntaxStart(); + const functionEndSyntaxRes = generateResponseState.detectFunctionEndSyntax(); + if (functionEndSyntaxRes != null) + return functionEndSyntaxRes; - generateResponseState.handleInitiallyEngagedFunctionModeFunctionDetection(); - generateResponseState.handleFunctionSyntax(); + generateResponseState.recordStopGenerationEvaluation(); - const functionEndSyntaxRes = generateResponseState.detectFunctionEndSyntax(); - if (functionEndSyntaxRes != null) - return functionEndSyntaxRes; + generateResponseState.popStreamRegulatorFreeTokens(); + generateResponseState.removeFoundStartIgnoreTextsFromPendingTokens(); - generateResponseState.recordStopGenerationEvaluation(); + const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger(); + if (stopGenerationTriggerRes != null) + return stopGenerationTriggerRes; - generateResponseState.popStreamRegulatorFreeTokens(); - generateResponseState.removeFoundStartIgnoreTextsFromPendingTokens(); + generateResponseState.spliceIgnoreStartTextDetectedTokens(); - const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger(); - if (stopGenerationTriggerRes != null) - return stopGenerationTriggerRes; + generateResponseState.moveFreePendingTokensToRes(); - generateResponseState.spliceIgnoreStartTextDetectedTokens(); + const maxTokensTriggerRes = generateResponseState.handleMaxTokensTrigger(); + if (maxTokensTriggerRes != null) + return maxTokensTriggerRes; - generateResponseState.moveFreePendingTokensToRes(); + if (generateResponseState.updateShouldContextShift()) + break; - const maxTokensTriggerRes = generateResponseState.handleMaxTokensTrigger(); - if (maxTokensTriggerRes != null) - return maxTokensTriggerRes; + const abortRes = generateResponseState.handleAbortTrigger(); + if (abortRes != null) + return abortRes; + } + + generateResponseState.isFirstEvaluation = false; - if (generateResponseState.updateShouldContextShift()) - break; + if (generateResponseState.shouldContextShift) + continue; - const abortRes = generateResponseState.handleAbortTrigger(); - if (abortRes != null) - return abortRes; + break; } - generateResponseState.isFirstEvaluation = false; + throw new Error("The context size is too small to generate a response"); + } finally { + generateResponseState.dispose(); + } + }); + } - if (generateResponseState.shouldContextShift) - continue; + public async loadChatAndCompleteUserMessage( + history: ChatHistoryItem[], + options: LLamaChatLoadAndCompleteUserMessageOptions = {} + ): Promise { + const { + initialUserPrompt = "", + stopOnAbortSignal = false, + onToken, + signal, + maxTokens = Math.min(256, Math.ceil(this.context.contextSize / 2)), + temperature, + minP, + topK, + topP, + 
grammar, + trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, + repeatPenalty = {}, + tokenBias, + evaluationPriority = defaultEvaluationPriority, + functions, + documentFunctionParams, + contextShift = defaultContextShiftOptions, + customStopTriggers, + lastEvaluationContextWindow: { + history: lastEvaluationContextWindowHistory, + minimumOverlapPercentageToPreventContextShift = 0.8 + } = {} + } = options; - break; + const generateResponseState = new GenerateResponseState( + this, + this._chatWrapper, + history, + { + onToken, + signal, + stopOnAbortSignal, + maxTokens, + temperature, + minP, + topK, + topP, + grammar: grammar as undefined, // this is a workaround to allow passing both `functions` and `grammar` + trimWhitespaceSuffix, + repeatPenalty, + tokenBias, + evaluationPriority, + functions, + documentFunctionParams, + contextShift, + customStopTriggers, + lastEvaluationContextWindow: { + history: lastEvaluationContextWindowHistory, + minimumOverlapPercentageToPreventContextShift + } } + ); - throw new Error("The context size is too small to generate a response"); - } finally { - generateResponseState.dispose(); - } + return await withLock(this._chatLock, "evaluate", signal, async (): Promise => { + try { + generateResponseState.ensureLastHistoryItemIsUser(); + const lastResolvedHistoryItem = generateResponseState.resolvedHistory[generateResponseState.resolvedHistory.length - 1]; + const initialUserMessage = lastResolvedHistoryItem?.type === "user" + ? lastResolvedHistoryItem.text + : ""; + + // eslint-disable-next-line no-constant-condition + while (true) { + generateResponseState.startTokenLoop(); + const {userTextSuffix} = await generateResponseState.loadContextWindow( + setLastUserTextInChatHistory( + generateResponseState.resolvedHistory, + initialUserMessage + initialUserPrompt + this.model.detokenize(generateResponseState.res) + ), + true + ); + generateResponseState.inFunctionEvaluationMode = false; + + generateResponseState.addStopGenerationTriggersFromChatWrapper(); + + if (userTextSuffix != null && userTextSuffix.values.length > 0) + generateResponseState.stopGenerationDetector.addStopTrigger( + StopGenerationDetector.resolveLlamaTextTrigger(userTextSuffix, this.model.tokenizer) + ); + + await generateResponseState.alignCurrentSequenceStateWithCurrentTokens(); + + if (generateResponseState.maxTokens === 0) { + await generateResponseState.evaluateWithoutGeneratingNewTokens(); + + return { + completion: "", + lastEvaluation: { + contextWindow: setLastUserTextInChatHistory( + generateResponseState.contextWindowHistory, + initialUserMessage + ), + contextShiftMetadata: generateResponseState.lastHistoryCompressionMetadata + }, + metadata: { + stopReason: "maxTokens" + } + }; + } + + await generateResponseState.createNewEvaluationIterator(); + while (await generateResponseState.iterateEvaluation()) { + generateResponseState.waitOnPartialCharactersOrWhiteSpaceTokens(); + + generateResponseState.recordStopGenerationEvaluation(); + + generateResponseState.popStreamRegulatorFreeTokens(); + + const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger(); + if (stopGenerationTriggerRes != null) + return { + completion: stopGenerationTriggerRes.response, + lastEvaluation: { + contextWindow: setLastUserTextInChatHistory( + generateResponseState.contextWindowHistory, + initialUserMessage + ), + contextShiftMetadata: stopGenerationTriggerRes.lastEvaluation.contextShiftMetadata + }, + metadata: stopGenerationTriggerRes.metadata.stopReason === "customStopTrigger" + 
? stopGenerationTriggerRes.metadata + : stopGenerationTriggerRes.metadata + }; + + generateResponseState.moveFreePendingTokensToRes(false); + + const maxTokensTriggerRes = generateResponseState.handleMaxTokensTrigger(); + if (maxTokensTriggerRes != null) + return { + completion: maxTokensTriggerRes.response, + lastEvaluation: { + contextWindow: setLastUserTextInChatHistory( + generateResponseState.contextWindowHistory, + initialUserMessage + ), + contextShiftMetadata: maxTokensTriggerRes.lastEvaluation.contextShiftMetadata + }, + metadata: maxTokensTriggerRes.metadata + }; + + if (generateResponseState.updateShouldContextShift()) + break; + + const abortRes = generateResponseState.handleAbortTrigger(); + if (abortRes != null) + return { + completion: abortRes.response, + lastEvaluation: { + contextWindow: setLastUserTextInChatHistory( + generateResponseState.contextWindowHistory, + initialUserMessage + ), + contextShiftMetadata: abortRes.lastEvaluation.contextShiftMetadata + }, + metadata: abortRes.metadata + }; + } + + generateResponseState.isFirstEvaluation = false; + + if (generateResponseState.shouldContextShift) + continue; + + break; + } + + throw new Error("The context size is too small to generate a completion"); + } finally { + generateResponseState.dispose(); + } + }); } } @@ -429,6 +667,26 @@ export type LlamaChatResponseFunctionCall< raw: string }; +export type LlamaChatLoadAndCompleteUserResponse = { + completion: string, + lastEvaluation: { + /** + * The completion and initial user prompt are not added to this context window result, + * but are loaded to the current context sequence state as tokens + */ + contextWindow: ChatHistoryItem[], + contextShiftMetadata: any + }, + metadata: { + remainingGenerationAfterStop?: string | Token[], + stopReason: "eogToken" | "stopGenerationTrigger" | "maxTokens" | "abort" + } | { + remainingGenerationAfterStop?: string | Token[], + stopReason: "customStopTrigger", + customStopTrigger: (string | Token)[] + } +}; + function removeRawFromHistoryItem(historyItem: Item): Item { if (historyItem.type === "model") { const newHistoryItem: ChatModelResponse = {...historyItem}; @@ -559,6 +817,13 @@ function getLastTextModelResponseFromChatHistory(chatHistory: ChatHistoryItem[]) return ""; } +function getLastUserTextFromChatHistory(chatHistory: ChatHistoryItem[]) { + if (chatHistory.length === 0 || chatHistory[chatHistory.length - 1].type !== "user") + return ""; + + return (chatHistory[chatHistory.length - 1] as ChatUserMessage).text; +} + function setLastModelTextResponseInChatHistory(chatHistory: ChatHistoryItem[], textResponse: string) { const newChatHistory = chatHistory.slice(); if (newChatHistory.length === 0 || newChatHistory[newChatHistory.length - 1].type !== "model") @@ -585,22 +850,96 @@ function setLastModelTextResponseInChatHistory(chatHistory: ChatHistoryItem[], t return newChatHistory; } +function setLastUserTextInChatHistory(chatHistory: ChatHistoryItem[], textResponse: string) { + const newChatHistory = chatHistory.slice(); + if (newChatHistory.length === 0 || newChatHistory[newChatHistory.length - 1].type !== "user") + newChatHistory.push({ + type: "user", + text: "" + }); + + const lastUserItem = newChatHistory[newChatHistory.length - 1] as ChatUserMessage; + const newLastUserItem = {...lastUserItem}; + newChatHistory[newChatHistory.length - 1] = newLastUserItem; + + newLastUserItem.text = textResponse; + + return newChatHistory; +} + +function generateContextText( + endWithUserText: boolean, + chatWrapper: ChatWrapper, + 
chatHistory: ChatHistoryItem[], + options?: Parameters[1] +): ReturnType { + if (endWithUserText) + return generateContextTextThatEndsWithUserText(chatWrapper, chatHistory, options); + + return chatWrapper.generateContextText(chatHistory, options); +} + +function generateContextTextThatEndsWithUserText( + chatWrapper: ChatWrapper, chatHistory: ChatHistoryItem[], options?: Parameters[1] +): ReturnType & { + userTextSuffix?: LlamaText +} { + const lastUserText = getLastUserTextFromChatHistory(chatHistory); + const randomId = "W" + (Math.random() + .toString(36) + .slice(2)) + "W"; + const {contextText, ...rest} = chatWrapper.generateContextText( + setLastUserTextInChatHistory(chatHistory, lastUserText + randomId), + options + ); + let newContextText = contextText; + + for (let i = 0; i < newContextText.values.length; i++) { + const item = newContextText.values[i]; + if (typeof item !== "string") + continue; + + const randomTextIndex = item.indexOf(randomId); + if (randomTextIndex < 0) + continue; + + const newValue = item.slice(0, randomTextIndex); + newContextText = LlamaText([ + ...newContextText.values.slice(0, i), + newValue + ]); + return { + contextText: newContextText, + userTextSuffix: LlamaText([ + item.slice(randomTextIndex + randomId.length), + ...newContextText.values.slice(i + 1) + ]), + ...rest + }; + } + + throw new Error("The random ID was not found in the context text. " + + `There might be an issue with the chat wrapper "${chatWrapper.wrapperName}" ` + + "where not all user messages are properly added to the the result LlamaText" + ); +} + async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHistoryCompressionMetadata, pendingTokensCount = 0, isFirstEvaluation, chatWrapper, lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift, - sequence, minFreeContextTokens = 1, functions, documentFunctionParams + sequence, minFreeContextTokens = 1, functions, documentFunctionParams, endWithUserText }: { resolvedHistory: ChatHistoryItem[], resolvedContextShift: Required, lastHistoryCompressionMetadata: object | null | undefined, pendingTokensCount: number, isFirstEvaluation: boolean, chatWrapper: ChatWrapper, lastEvaluationContextWindowHistory?: ChatHistoryItem[], minimumOverlapPercentageToPreventContextShift: number, sequence?: LlamaContextSequence, minFreeContextTokens?: number, functions?: ChatModelFunctions, - documentFunctionParams?: boolean + documentFunctionParams?: boolean, endWithUserText: boolean }): Promise<{ history: ChatHistoryItem[], stopGenerationTriggers: LlamaText[], tokens: Token[], newResolvedHistory: ChatHistoryItem[], newHistoryCompressionMetadata: object | null | undefined, ignoreStartText: LlamaText[], functionCallInitiallyEngaged: boolean, - disengageInitiallyEngagedFunctionCall: LlamaText[] + disengageInitiallyEngagedFunctionCall: LlamaText[], userTextSuffix?: LlamaText }> { if (sequence == null) throw new DisposedError(); @@ -617,10 +956,15 @@ async function getContextWindow({ response: [] }); - const {contextText, stopGenerationTriggers, ignoreStartText, functionCall} = chatWrapper.generateContextText(newContextWindow, { - availableFunctions: functions, - documentFunctionParams - }); + const {contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix} = generateContextText( + endWithUserText, + chatWrapper, + newContextWindow, + { + availableFunctions: functions, + documentFunctionParams + } + ); const tokens = contextText.tokenize(model.tokenizer); if (tokens.length + 
pendingTokensCount + minFreeContextTokens < context.contextSize) { const {firstDifferentIndex} = sequence.compareContextTokens(tokens); @@ -636,7 +980,8 @@ async function getContextWindow({ newHistoryCompressionMetadata: lastHistoryCompressionMetadata, ignoreStartText: ignoreStartText ?? [], functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false, - disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [] + disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [], + userTextSuffix }; } } @@ -665,10 +1010,15 @@ async function getContextWindow({ documentFunctionParams }); - const {contextText, stopGenerationTriggers, ignoreStartText, functionCall} = chatWrapper.generateContextText(compressedHistory, { - availableFunctions: functions, - documentFunctionParams - }); + const {contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix} = generateContextText( + endWithUserText, + chatWrapper, + compressedHistory, + { + availableFunctions: functions, + documentFunctionParams + } + ); return { history: compressedHistory, @@ -678,15 +1028,21 @@ async function getContextWindow({ newHistoryCompressionMetadata: metadata, ignoreStartText: ignoreStartText ?? [], functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false, - disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [] + disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [], + userTextSuffix }; } { - const {contextText, stopGenerationTriggers, ignoreStartText, functionCall} = chatWrapper.generateContextText(resolvedHistory, { - availableFunctions: functions, - documentFunctionParams - }); + const {contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix} = generateContextText( + endWithUserText, + chatWrapper, + resolvedHistory, + { + availableFunctions: functions, + documentFunctionParams + } + ); const tokens = contextText.tokenize(model.tokenizer); if (tokens.length + pendingTokensCount + minFreeContextTokens < context.contextSize) @@ -698,7 +1054,8 @@ async function getContextWindow({ newHistoryCompressionMetadata: lastHistoryCompressionMetadata, ignoreStartText: ignoreStartText ?? [], functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false, - disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [] + disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [], + userTextSuffix }; } @@ -729,10 +1086,15 @@ async function getContextWindow({ documentFunctionParams }); - const {contextText, stopGenerationTriggers, ignoreStartText, functionCall} = chatWrapper.generateContextText(compressedHistory, { - availableFunctions: functions, - documentFunctionParams - }); + const {contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix} = generateContextText( + endWithUserText, + chatWrapper, + compressedHistory, + { + availableFunctions: functions, + documentFunctionParams + } + ); return { history: compressedHistory, @@ -742,7 +1104,8 @@ async function getContextWindow({ newHistoryCompressionMetadata: metadata, ignoreStartText: ignoreStartText ?? [], functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false, - disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [] + disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? 
[], + userTextSuffix }; } @@ -754,12 +1117,12 @@ class GenerateResponseState["onToken"]; private readonly signal: LLamaChatGenerateResponseOptions["signal"]; private readonly stopOnAbortSignal: LLamaChatGenerateResponseOptions["stopOnAbortSignal"]; - private readonly maxTokens: LLamaChatGenerateResponseOptions["maxTokens"]; + public readonly maxTokens: LLamaChatGenerateResponseOptions["maxTokens"]; private readonly temperature: LLamaChatGenerateResponseOptions["temperature"]; private readonly minP: LLamaChatGenerateResponseOptions["minP"]; private readonly topK: LLamaChatGenerateResponseOptions["topK"]; private readonly topP: LLamaChatGenerateResponseOptions["topP"]; - private readonly grammar: LLamaChatGenerateResponseOptions["grammar"]; + public readonly grammar: LLamaChatGenerateResponseOptions["grammar"]; private readonly trimWhitespaceSuffix: LLamaChatGenerateResponseOptions["trimWhitespaceSuffix"]; private readonly tokenBias: LLamaChatGenerateResponseOptions["tokenBias"]; private readonly evaluationPriority: LLamaChatGenerateResponseOptions["evaluationPriority"]; @@ -782,7 +1145,7 @@ class GenerateResponseState 0); - if (this.grammar != null && this.functionsEnabled) - throw new Error("Using both grammar and functions is not supported yet"); - if (this.signal?.aborted) throw this.signal.reason; @@ -1061,7 +1421,7 @@ class GenerateResponseState | undefined { + public handleStopGenerationTrigger() { if (this.stopGenerationDetector.hasTriggeredStops || this.customStopGenerationTriggersDetector.hasTriggeredStops || this.llamaChat.model.isEogToken(this.currentToken) ) { @@ -1510,7 +1885,7 @@ class GenerateResponseState; } return { @@ -1521,7 +1896,7 @@ class GenerateResponseState; } return undefined; @@ -1543,9 +1918,10 @@ class GenerateResponseState 0 && this.generatedTokens >= this.maxTokens; } - public moveFreePendingTokensToRes() { + public moveFreePendingTokensToRes(removeFoundStartIgnoreTextsFromPendingTokens: boolean = true) { if (this.pendingTokens.length > 0 && (this.isMaxTokensTriggered() || !this.ignoreStartTextDetector.hasInProgressStops)) { - this.removeFoundStartIgnoreTextsFromPendingTokens(); + if (removeFoundStartIgnoreTextsFromPendingTokens) + this.removeFoundStartIgnoreTextsFromPendingTokens(); if (this.pendingTokens.length > 0) { this.onToken?.(this.pendingTokens.slice()); @@ -1556,7 +1932,7 @@ class GenerateResponseState | undefined { + public handleMaxTokensTrigger() { if (this.isMaxTokensTriggered()) { let modelResponse = this.llamaChat.model.detokenize(this.res); let contextWindowModelResponse = this.llamaChat.model.detokenize(this.contextWindowsRes); @@ -1582,7 +1958,7 @@ class GenerateResponseState; } return undefined; @@ -1593,7 +1969,7 @@ class GenerateResponseState | undefined { + public handleAbortTrigger() { if (this.signal?.aborted && this.stopOnAbortSignal) { if (this.res.length === 0) throw this.signal.reason; @@ -1622,7 +1998,7 @@ class GenerateResponseState; } return undefined; diff --git a/src/evaluator/LlamaChatSession/LlamaChatSession.ts b/src/evaluator/LlamaChatSession/LlamaChatSession.ts index 1c8a2492..315ea186 100644 --- a/src/evaluator/LlamaChatSession/LlamaChatSession.ts +++ b/src/evaluator/LlamaChatSession/LlamaChatSession.ts @@ -130,6 +130,55 @@ export type LLamaChatPromptOptions { + return await withLock(this._chatLock, "evaluation", signal, async () => { this._ensureNotDisposed(); if (this._chat == null) @@ -368,7 +418,7 @@ export class LlamaChatSession { evaluationPriority, lastEvaluationContextWindow: { history: 
newContextWindowChatHistory, - minimumOverlapPercentageToPreventContextShift: 0.01 + minimumOverlapPercentageToPreventContextShift: 0.5 } }); this._ensureNotDisposed(); @@ -443,6 +493,125 @@ export class LlamaChatSession { }); } + /** + * Preload a user prompt into the current context sequence state to make later inference of the model response begin sooner + * and feel faster. + * + * If `maxTokens` is set to a value greater than `0`, + * a completion for the given user prompt will be generated up to the given number of tokens. + * + * > **Note:** Preloading a long user prompt and completing a user prompt with a high number of `maxTokens` can incur context shifts, + * > so consider limiting the length of prompts you preload. + * > + * > Also, it's recommended to limit the number of tokens generated to a reasonable amount. + * + * Defaults to `0`. + * @param prompt - the prompt to preload + * @param [options] + */ + public async preloadPrompt( + prompt: string, + options: LLamaChatPreloadPromptOptions & { + maxTokens?: MaxTokens + } = {} + ): Promise<0 | undefined extends MaxTokens ? void : string> { + const {completion} = await this.preloadPromptWithMeta(prompt, options); + + if (options?.maxTokens == null || options?.maxTokens === 0) + return undefined as (0 | undefined extends MaxTokens ? void : string); + + return completion as (0 | undefined extends MaxTokens ? void : string); + } + + /** + * See `preloadPrompt` for more information. + * @param prompt + * @param [options] + */ + public async preloadPromptWithMeta(prompt: string, { + maxTokens = 0, + stopOnAbortSignal = false, + + functions, + documentFunctionParams, + onToken, + signal, + temperature, + minP, + topK, + topP, + grammar, + trimWhitespaceSuffix = false, + repeatPenalty, + tokenBias, + customStopTriggers, + evaluationPriority + }: LLamaChatPreloadPromptOptions = {}) { + this._ensureNotDisposed(); + + if (grammar != null && grammar._llama !== this.model._llama) + throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. 
Make sure you use the same Llama instance for both the model and the grammar."); + + return await withLock(this._chatLock, "evaluation", signal, async () => { + this._ensureNotDisposed(); + + if (this._chat == null) + throw new DisposedError(); + + const {completion, lastEvaluation, metadata} = await this._chat.loadChatAndCompleteUserMessage(this._chatHistory, { + initialUserPrompt: prompt, + functions, + documentFunctionParams, + grammar, + onToken, + signal, + stopOnAbortSignal: true, + repeatPenalty, + minP, + topK, + topP, + tokenBias, + customStopTriggers, + maxTokens, + temperature, + trimWhitespaceSuffix, + contextShift: { + ...this._contextShift, + lastEvaluationMetadata: this._lastEvaluation?.contextShiftMetadata + }, + evaluationPriority, + lastEvaluationContextWindow: { + history: this._lastEvaluation?.contextWindow, + minimumOverlapPercentageToPreventContextShift: 0.8 + } + }); + this._ensureNotDisposed(); + + this._lastEvaluation = { + cleanHistory: this._chatHistory, + contextWindow: lastEvaluation.contextWindow, + contextShiftMetadata: lastEvaluation.contextShiftMetadata + }; + + if (!stopOnAbortSignal && metadata.stopReason === "abort" && signal?.aborted) + throw signal.reason; + + if (metadata.stopReason === "customStopTrigger") + return { + completion: completion, + stopReason: metadata.stopReason, + customStopTrigger: metadata.customStopTrigger, + remainingGenerationAfterStop: metadata.remainingGenerationAfterStop + }; + + return { + completion: completion, + stopReason: metadata.stopReason, + remainingGenerationAfterStop: metadata.remainingGenerationAfterStop + }; + }); + } + public getChatHistory() { return structuredClone(this._chatHistory); } diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index d811ff7f..8a8763af 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -861,8 +861,6 @@ export class LlamaContextSequence { strategy: contextShiftStrategy = this._contextShift.strategy } = {} }: { - grammarEvaluationState?: LlamaGrammarEvaluationState, - /** * When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be * evaluated based on the strategy chosen for the context. 
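A rough usage sketch of the prompt preloading API added in this patch (not part of the diff itself; the model loading step and the literal prompts are placeholders, mirroring the chatSession tests further below):

    // sketch only: assumes `model` was already loaded via llama.loadModel(...)
    const context = await model.createContext({contextSize: 2048});
    const session = new LlamaChatSession({contextSequence: context.getSequence()});

    // preload the user prompt so the later .prompt() call starts responding sooner
    await session.preloadPrompt("Describe the appearance of a llama");

    // or additionally generate a short completion for the preloaded prompt
    const completion = await session.preloadPrompt(
        "Describe the appearance of a llama and explain what",
        {maxTokens: 40}
    );
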
diff --git a/src/index.ts b/src/index.ts index 9fe6c331..f2f8f25b 100644 --- a/src/index.ts +++ b/src/index.ts @@ -18,12 +18,12 @@ import { import {TokenBias} from "./evaluator/TokenBias.js"; import { LlamaChatSession, type LlamaChatSessionOptions, type LlamaChatSessionContextShiftOptions, - type LLamaChatPromptOptions, type LlamaChatSessionRepeatPenalty + type LLamaChatPromptOptions, type LLamaChatPreloadPromptOptions, type LlamaChatSessionRepeatPenalty } from "./evaluator/LlamaChatSession/LlamaChatSession.js"; import {defineChatSessionFunction} from "./evaluator/LlamaChatSession/utils/defineChatSessionFunction.js"; import { - LlamaChat, type LlamaChatOptions, type LLamaChatGenerateResponseOptions, type LLamaChatContextShiftOptions, - type LlamaChatResponse, type LlamaChatResponseFunctionCall + LlamaChat, type LlamaChatOptions, type LLamaChatGenerateResponseOptions, type LLamaChatLoadAndCompleteUserMessageOptions, + type LLamaChatContextShiftOptions, type LlamaChatResponse, type LlamaChatResponseFunctionCall, type LlamaChatLoadAndCompleteUserResponse } from "./evaluator/LlamaChat/LlamaChat.js"; import { LlamaCompletion, type LlamaCompletionOptions, type LlamaCompletionGenerationOptions, type LlamaInfillGenerationOptions @@ -116,14 +116,17 @@ export { type LlamaChatSessionOptions, type LlamaChatSessionContextShiftOptions, type LLamaChatPromptOptions, + type LLamaChatPreloadPromptOptions, type LlamaChatSessionRepeatPenalty, LlamaChat, type LlamaChatOptions, type LLamaChatGenerateResponseOptions, + type LLamaChatLoadAndCompleteUserMessageOptions, type LLamaChatContextShiftOptions, type LLamaContextualRepeatPenalty, type LlamaChatResponse, type LlamaChatResponseFunctionCall, + type LlamaChatLoadAndCompleteUserResponse, LlamaCompletion, type LlamaCompletionOptions, type LlamaCompletionGenerationOptions, diff --git a/test/modelDependent/llama3/chatSession.test.ts b/test/modelDependent/llama3/chatSession.test.ts index 4e545edf..b12043fd 100644 --- a/test/modelDependent/llama3/chatSession.test.ts +++ b/test/modelDependent/llama3/chatSession.test.ts @@ -63,6 +63,50 @@ describe("llama 3", () => { expect(res.responseText.toLowerCase()).to.not.include("llama"); }); + test("preloading a prompt works", {timeout: 1000 * 60 * 60 * 2}, async () => { + const modelPath = await getModelFile("Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"); + const llama = await getTestLlama(); + + const model = await llama.loadModel({ + modelPath + }); + const context = await model.createContext({ + contextSize: 2048 + }); + const chatSession = new LlamaChatSession({ + contextSequence: context.getSequence() + }); + + expect(chatSession.chatWrapper).to.be.an.instanceof(Llama3ChatWrapper); + + const prompt = "Describe the appearance of a llama"; + await chatSession.preloadPrompt(prompt); + expect(model.detokenize(chatSession.sequence.contextTokens).endsWith(prompt)).to.eql(true); + }); + + test("completing a prompt works", {timeout: 1000 * 60 * 60 * 2}, async () => { + const modelPath = await getModelFile("Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"); + const llama = await getTestLlama(); + + const model = await llama.loadModel({ + modelPath + }); + const context = await model.createContext({ + contextSize: 2048 + }); + const chatSession = new LlamaChatSession({ + contextSequence: context.getSequence() + }); + + expect(chatSession.chatWrapper).to.be.an.instanceof(Llama3ChatWrapper); + + const prompt = "Describe the appearance of a llama and explain what"; + const completion = await chatSession.preloadPrompt(prompt, { + maxTokens: 40 + }); + 
expect(completion).to.eql(" it is."); + }); + // disabled due to getting timeout in the CI due to taking too long test.skip("context shift works correctly", {timeout: 1000 * 60 * 60 * 2}, async () => { const contextSize = 2048; From 21629bce34464512bbd7aac529ad7a31f07a564c Mon Sep 17 00:00:00 2001 From: Gilad S Date: Fri, 24 May 2024 19:27:04 +0300 Subject: [PATCH 03/39] chore: remove redundant setting in script --- test/utils/setupAndTestOnPaperspace.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/utils/setupAndTestOnPaperspace.sh b/test/utils/setupAndTestOnPaperspace.sh index a3417f3e..b0804777 100644 --- a/test/utils/setupAndTestOnPaperspace.sh +++ b/test/utils/setupAndTestOnPaperspace.sh @@ -178,10 +178,10 @@ while true; do node ./dist/cli/cli.js inspect gpu echo "Running tests using CUDA..." - NODE_LLAMA_CPP_GPU=cuda NODE_LLAMA_CPP_LOG_LEVEL=warn npm run --silent test + NODE_LLAMA_CPP_GPU=cuda npm run --silent test echo "Running tests using Vulkan..." - NODE_LLAMA_CPP_GPU=vulkan NODE_LLAMA_CPP_LOG_LEVEL=warn npm run --silent test + NODE_LLAMA_CPP_GPU=vulkan npm run --silent test echo "" echo "Done running tests" From c578ddb81e034fa7d34a640b285d3d02fd646ba9 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Fri, 24 May 2024 19:27:14 +0300 Subject: [PATCH 04/39] fix: bug --- .../src/App/components/ChatHistory/ChatHistory.css | 2 ++ 1 file changed, 2 insertions(+) diff --git a/templates/electron-typescript-react/src/App/components/ChatHistory/ChatHistory.css b/templates/electron-typescript-react/src/App/components/ChatHistory/ChatHistory.css index 72194661..a6cb6e0d 100644 --- a/templates/electron-typescript-react/src/App/components/ChatHistory/ChatHistory.css +++ b/templates/electron-typescript-react/src/App/components/ChatHistory/ChatHistory.css @@ -15,6 +15,7 @@ margin-inline-start: 48px; margin-inline-end: 12px; color: var(--user-message-text-color); + white-space: pre-wrap; &:not(:first-child) { margin-top: 36px; @@ -25,6 +26,7 @@ align-self: flex-start; margin-inline-end: 48px; padding-inline-start: 24px; + white-space: pre-wrap; &.active { &:after { From 2d38a7ea9526f7798c5a4d8f707dac5777eb863b Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sat, 25 May 2024 05:25:13 +0300 Subject: [PATCH 05/39] feat: prompt completion engine --- .../generic/JinjaTemplateChatWrapper.ts | 2 +- .../generic/TemplateChatWrapper.ts | 2 +- src/evaluator/LlamaChat/LlamaChat.ts | 91 ++++-- .../LlamaChatSession/LlamaChatSession.ts | 188 +++++++----- .../LlamaChatSessionPromptCompletionEngine.ts | 282 ++++++++++++++++++ src/index.ts | 9 +- src/utils/LruCache.ts | 58 ++++ src/utils/getConsoleLogPrefix.ts | 1 - src/utils/wrapAbortSignal.ts | 10 + 9 files changed, 537 insertions(+), 106 deletions(-) create mode 100644 src/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.ts create mode 100644 src/utils/LruCache.ts create mode 100644 src/utils/wrapAbortSignal.ts diff --git a/src/chatWrappers/generic/JinjaTemplateChatWrapper.ts b/src/chatWrappers/generic/JinjaTemplateChatWrapper.ts index c8986412..da213c4c 100644 --- a/src/chatWrappers/generic/JinjaTemplateChatWrapper.ts +++ b/src/chatWrappers/generic/JinjaTemplateChatWrapper.ts @@ -102,7 +102,7 @@ export class JinjaTemplateChatWrapper extends ChatWrapper { this.trimLeadingWhitespaceInResponses = trimLeadingWhitespaceInResponses; this.settings = { - ...super.settings, + ...ChatWrapper.defaultSetting, functions: parseFunctionCallMessageTemplate(functionCallMessageTemplate) ?? 
ChatWrapper.defaultSetting.functions }; diff --git a/src/chatWrappers/generic/TemplateChatWrapper.ts b/src/chatWrappers/generic/TemplateChatWrapper.ts index 6ad6d930..112250e7 100644 --- a/src/chatWrappers/generic/TemplateChatWrapper.ts +++ b/src/chatWrappers/generic/TemplateChatWrapper.ts @@ -86,7 +86,7 @@ export class TemplateChatWrapper extends ChatWrapper { this._parsedChatHistoryTemplate = parseChatHistoryTemplate(historyTemplate); this.settings = { - ...super.settings, + ...ChatWrapper.defaultSetting, functions: parseFunctionCallMessageTemplate(functionCallMessageTemplate) ?? ChatWrapper.defaultSetting.functions }; } diff --git a/src/evaluator/LlamaChat/LlamaChat.ts b/src/evaluator/LlamaChat/LlamaChat.ts index 7d0af385..2aa54e65 100644 --- a/src/evaluator/LlamaChat/LlamaChat.ts +++ b/src/evaluator/LlamaChat/LlamaChat.ts @@ -412,7 +412,7 @@ export class LlamaChat { generateResponseState.handleInitiallyEngagedFunctionModeFunctionDetection(); generateResponseState.handleFunctionSyntax(); - const functionEndSyntaxRes = generateResponseState.detectFunctionEndSyntax(); + const functionEndSyntaxRes = generateResponseState.detectFunctionEndSyntax("model"); if (functionEndSyntaxRes != null) return functionEndSyntaxRes; @@ -421,7 +421,7 @@ export class LlamaChat { generateResponseState.popStreamRegulatorFreeTokens(); generateResponseState.removeFoundStartIgnoreTextsFromPendingTokens(); - const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger(); + const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger("model"); if (stopGenerationTriggerRes != null) return stopGenerationTriggerRes; @@ -429,14 +429,14 @@ export class LlamaChat { generateResponseState.moveFreePendingTokensToRes(); - const maxTokensTriggerRes = generateResponseState.handleMaxTokensTrigger(); + const maxTokensTriggerRes = generateResponseState.handleMaxTokensTrigger("model"); if (maxTokensTriggerRes != null) return maxTokensTriggerRes; if (generateResponseState.updateShouldContextShift()) break; - const abortRes = generateResponseState.handleAbortTrigger(); + const abortRes = generateResponseState.handleAbortTrigger("model"); if (abortRes != null) return abortRes; } @@ -485,6 +485,13 @@ export class LlamaChat { } = {} } = options; + const lastEvaluationContextWindowHistoryItem = lastEvaluationContextWindowHistory == null + ? null + : lastEvaluationContextWindowHistory[lastEvaluationContextWindowHistory.length - 1]; + const lastEvaluationContextWindowUserMessage = lastEvaluationContextWindowHistoryItem?.type === "user" + ? lastEvaluationContextWindowHistoryItem.text + : ""; + const generateResponseState = new GenerateResponseState( this, this._chatWrapper, @@ -508,7 +515,12 @@ export class LlamaChat { contextShift, customStopTriggers, lastEvaluationContextWindow: { - history: lastEvaluationContextWindowHistory, + history: lastEvaluationContextWindowHistory == null + ? 
undefined + : setLastUserTextInChatHistory( + lastEvaluationContextWindowHistory, + lastEvaluationContextWindowUserMessage + initialUserPrompt + ), minimumOverlapPercentageToPreventContextShift } } @@ -550,7 +562,7 @@ export class LlamaChat { completion: "", lastEvaluation: { contextWindow: setLastUserTextInChatHistory( - generateResponseState.contextWindowHistory, + generateResponseState.lastContextWindowHistory, initialUserMessage ), contextShiftMetadata: generateResponseState.lastHistoryCompressionMetadata @@ -569,13 +581,13 @@ export class LlamaChat { generateResponseState.popStreamRegulatorFreeTokens(); - const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger(); + const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger("user"); if (stopGenerationTriggerRes != null) return { completion: stopGenerationTriggerRes.response, lastEvaluation: { contextWindow: setLastUserTextInChatHistory( - generateResponseState.contextWindowHistory, + generateResponseState.lastContextWindowHistory, initialUserMessage ), contextShiftMetadata: stopGenerationTriggerRes.lastEvaluation.contextShiftMetadata @@ -587,13 +599,13 @@ export class LlamaChat { generateResponseState.moveFreePendingTokensToRes(false); - const maxTokensTriggerRes = generateResponseState.handleMaxTokensTrigger(); + const maxTokensTriggerRes = generateResponseState.handleMaxTokensTrigger("user"); if (maxTokensTriggerRes != null) return { completion: maxTokensTriggerRes.response, lastEvaluation: { contextWindow: setLastUserTextInChatHistory( - generateResponseState.contextWindowHistory, + generateResponseState.lastContextWindowHistory, initialUserMessage ), contextShiftMetadata: maxTokensTriggerRes.lastEvaluation.contextShiftMetadata @@ -604,13 +616,13 @@ export class LlamaChat { if (generateResponseState.updateShouldContextShift()) break; - const abortRes = generateResponseState.handleAbortTrigger(); + const abortRes = generateResponseState.handleAbortTrigger("user"); if (abortRes != null) return { completion: abortRes.response, lastEvaluation: { contextWindow: setLastUserTextInChatHistory( - generateResponseState.contextWindowHistory, + generateResponseState.lastContextWindowHistory, initialUserMessage ), contextShiftMetadata: abortRes.lastEvaluation.contextShiftMetadata @@ -850,7 +862,7 @@ function setLastModelTextResponseInChatHistory(chatHistory: ChatHistoryItem[], t return newChatHistory; } -function setLastUserTextInChatHistory(chatHistory: ChatHistoryItem[], textResponse: string) { +function setLastUserTextInChatHistory(chatHistory: ChatHistoryItem[], userText: string) { const newChatHistory = chatHistory.slice(); if (newChatHistory.length === 0 || newChatHistory[newChatHistory.length - 1].type !== "user") newChatHistory.push({ @@ -862,11 +874,18 @@ function setLastUserTextInChatHistory(chatHistory: ChatHistoryItem[], textRespon const newLastUserItem = {...lastUserItem}; newChatHistory[newChatHistory.length - 1] = newLastUserItem; - newLastUserItem.text = textResponse; + newLastUserItem.text = userText; return newChatHistory; } +function setLastTextInChatHistory(itemType: "user" | "model", chatHistory: ChatHistoryItem[], text: string) { + if (itemType === "user") + return setLastUserTextInChatHistory(chatHistory, text); + else + return setLastModelTextResponseInChatHistory(chatHistory, text); +} + function generateContextText( endWithUserText: boolean, chatWrapper: ChatWrapper, @@ -950,7 +969,13 @@ async function getContextWindow({ if (isFirstEvaluation && 
lastEvaluationContextWindowHistory != null && sequence.isLoadedToMemory) { const newContextWindow = lastEvaluationContextWindowHistory.slice(); - if (newContextWindow.length === 0 || newContextWindow[newContextWindow.length - 1].type !== "model") + if (endWithUserText) { + if (newContextWindow.length === 0 || newContextWindow[newContextWindow.length - 1].type !== "user") + newContextWindow.push({ + type: "user", + text: "" + }); + } else if (newContextWindow.length === 0 || newContextWindow[newContextWindow.length - 1].type !== "model") newContextWindow.push({ type: "model", response: [] @@ -1172,7 +1197,7 @@ class GenerateResponseState | undefined { + public detectFunctionEndSyntax(lastHistoryItemType: "user" | "model"): LlamaChatResponse | undefined { if (this.inFunctionEvaluationMode && this.functionSyntaxEndDetector.hasTriggeredStops && this.functionsGrammar != null) { const functionCallText = this.llamaChat.model.detokenize(this.functionCallTokens); const functionCall = this.functionsGrammar.parseFunctionCall(functionCallText); @@ -1777,11 +1802,13 @@ class GenerateResponseState(); public readonly onDispose = new EventRelay(); @@ -366,8 +379,10 @@ export class LlamaChatSession { if (grammar != null && grammar._llama !== this.model._llama) throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar."); + this._stopAllPreloadAndPromptCompletions(); return await withLock(this._chatLock, "evaluation", signal, async () => { this._ensureNotDisposed(); + this._stopAllPreloadAndPromptCompletions(); if (this._chat == null) throw new DisposedError(); @@ -468,6 +483,7 @@ export class LlamaChatSession { this._lastEvaluation = lastEvaluation; this._chatHistory = newChatHistory; + this._chatHistoryStateRef = {}; const lastModelResponseItem = getLastModelResponseItem(newChatHistory); const responseText = lastModelResponseItem.response @@ -497,39 +513,51 @@ export class LlamaChatSession { * Preload a user prompt into the current context sequence state to make later inference of the model response begin sooner * and feel faster. * - * If `maxTokens` is set to a value greater than `0`, - * a completion for the given user prompt will be generated up to the given number of tokens. + * > **Note:** Preloading a long user prompt can incur context shifts, so consider limiting the length of prompts you preload + * @param prompt - the prompt to preload + * @param [options] + */ + public async preloadPrompt(prompt: string, options: LLamaChatPreloadPromptOptions = {}): Promise { + await this.completePromptWithMeta(prompt, { + ...options, + maxTokens: 0 + }); + } + + /** + * Preload a user prompt into the current context sequence state and generate a completion for it. * * > **Note:** Preloading a long user prompt and completing a user prompt with a high number of `maxTokens` can incur context shifts, * > so consider limiting the length of prompts you preload. * > - * > Also, it's recommended to limit the number of tokens generated to a reasonable amount. - * - * Defaults to `0`. + * > Also, it's recommended to limit the number of tokens generated to a reasonable amount by configuring `maxTokens`. * @param prompt - the prompt to preload * @param [options] */ - public async preloadPrompt( - prompt: string, - options: LLamaChatPreloadPromptOptions & { - maxTokens?: MaxTokens - } = {} - ): Promise<0 | undefined extends MaxTokens ? 
void : string> { - const {completion} = await this.preloadPromptWithMeta(prompt, options); - - if (options?.maxTokens == null || options?.maxTokens === 0) - return undefined as (0 | undefined extends MaxTokens ? void : string); - - return completion as (0 | undefined extends MaxTokens ? void : string); + public async completePrompt(prompt: string, options: LLamaChatCompletePromptOptions = {}): Promise { + const {completion} = await this.completePromptWithMeta(prompt, options); + + return completion; + } + + /** + * Create a smart completion engine that caches the prompt completions + * and reuses them when the user prompt matches the beginning of the cached prompt or completion. + * + * All completions are made and cache is used only for the current chat session state. + * You can create a single completion engine for an entire chat session. + */ + public createPromptCompletionEngine(options?: LLamaChatPromptCompletionEngineOptions) { + return LlamaChatSessionPromptCompletionEngine._create(this, options); } /** - * See `preloadPrompt` for more information. + * See `completePrompt` for more information. * @param prompt * @param [options] */ - public async preloadPromptWithMeta(prompt: string, { - maxTokens = 0, + public async completePromptWithMeta(prompt: string, { + maxTokens, stopOnAbortSignal = false, functions, @@ -546,70 +574,77 @@ export class LlamaChatSession { tokenBias, customStopTriggers, evaluationPriority - }: LLamaChatPreloadPromptOptions = {}) { + }: LLamaChatCompletePromptOptions = {}) { this._ensureNotDisposed(); if (grammar != null && grammar._llama !== this.model._llama) throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar."); - return await withLock(this._chatLock, "evaluation", signal, async () => { - this._ensureNotDisposed(); + const abortController = wrapAbortSignal(signal); + this._preloadAndCompleteAbortControllers.add(abortController); - if (this._chat == null) - throw new DisposedError(); + try { + return await withLock(this._chatLock, "evaluation", abortController.signal, async () => { + this._ensureNotDisposed(); - const {completion, lastEvaluation, metadata} = await this._chat.loadChatAndCompleteUserMessage(this._chatHistory, { - initialUserPrompt: prompt, - functions, - documentFunctionParams, - grammar, - onToken, - signal, - stopOnAbortSignal: true, - repeatPenalty, - minP, - topK, - topP, - tokenBias, - customStopTriggers, - maxTokens, - temperature, - trimWhitespaceSuffix, - contextShift: { - ...this._contextShift, - lastEvaluationMetadata: this._lastEvaluation?.contextShiftMetadata - }, - evaluationPriority, - lastEvaluationContextWindow: { - history: this._lastEvaluation?.contextWindow, - minimumOverlapPercentageToPreventContextShift: 0.8 - } - }); - this._ensureNotDisposed(); + if (this._chat == null) + throw new DisposedError(); - this._lastEvaluation = { - cleanHistory: this._chatHistory, - contextWindow: lastEvaluation.contextWindow, - contextShiftMetadata: lastEvaluation.contextShiftMetadata - }; + const {completion, lastEvaluation, metadata} = await this._chat.loadChatAndCompleteUserMessage(this._chatHistory, { + initialUserPrompt: prompt, + functions, + documentFunctionParams, + grammar, + onToken, + signal: abortController.signal, + stopOnAbortSignal: true, + repeatPenalty, + minP, + topK, + topP, + tokenBias, + customStopTriggers, + maxTokens, + temperature, + 
trimWhitespaceSuffix, + contextShift: { + ...this._contextShift, + lastEvaluationMetadata: this._lastEvaluation?.contextShiftMetadata + }, + evaluationPriority, + lastEvaluationContextWindow: { + history: this._lastEvaluation?.contextWindow, + minimumOverlapPercentageToPreventContextShift: 0.8 + } + }); + this._ensureNotDisposed(); - if (!stopOnAbortSignal && metadata.stopReason === "abort" && signal?.aborted) - throw signal.reason; + this._lastEvaluation = { + cleanHistory: this._chatHistory, + contextWindow: lastEvaluation.contextWindow, + contextShiftMetadata: lastEvaluation.contextShiftMetadata + }; + + if (!stopOnAbortSignal && metadata.stopReason === "abort" && abortController.signal?.aborted) + throw abortController.signal.reason; + + if (metadata.stopReason === "customStopTrigger") + return { + completion: completion, + stopReason: metadata.stopReason, + customStopTrigger: metadata.customStopTrigger, + remainingGenerationAfterStop: metadata.remainingGenerationAfterStop + }; - if (metadata.stopReason === "customStopTrigger") return { completion: completion, stopReason: metadata.stopReason, - customStopTrigger: metadata.customStopTrigger, remainingGenerationAfterStop: metadata.remainingGenerationAfterStop }; - - return { - completion: completion, - stopReason: metadata.stopReason, - remainingGenerationAfterStop: metadata.remainingGenerationAfterStop - }; - }); + }); + } finally { + this._preloadAndCompleteAbortControllers.delete(abortController); + } } public getChatHistory() { @@ -625,9 +660,18 @@ export class LlamaChatSession { public setChatHistory(chatHistory: ChatHistoryItem[]) { this._chatHistory = structuredClone(chatHistory); + this._chatHistoryStateRef = {}; this._lastEvaluation = undefined; } + /** @internal */ + private _stopAllPreloadAndPromptCompletions() { + for (const abortController of this._preloadAndCompleteAbortControllers) + abortController.abort(); + + this._preloadAndCompleteAbortControllers.clear(); + } + /** @internal */ private _ensureNotDisposed() { if (this.disposed) diff --git a/src/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.ts b/src/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.ts new file mode 100644 index 00000000..29d61891 --- /dev/null +++ b/src/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.ts @@ -0,0 +1,282 @@ +import {DisposeAggregator, DisposedError} from "lifecycle-utils"; +import {Token} from "../../../types.js"; +import {getConsoleLogPrefix} from "../../../utils/getConsoleLogPrefix.js"; +import {LruCache} from "../../../utils/LruCache.js"; +import type {LLamaChatCompletePromptOptions, LlamaChatSession} from "../LlamaChatSession.js"; + +export type LLamaChatPromptCompletionEngineOptions = { + /** + * Max tokens to allow for preloading a prompt and generating a completion for it. + * + * Defaults to `256` or half of the context size, whichever is smaller. + */ + maxPreloadTokens?: number, + onGeneration?(prompt: string, completion: string): void, + + /** + * Max number of completions to cache. + * + * Defaults to `100`. 
+ */ + maxCachedCompletions?: number, + + temperature?: LLamaChatCompletePromptOptions["temperature"], + minP?: LLamaChatCompletePromptOptions["minP"], + topK?: LLamaChatCompletePromptOptions["topK"], + topP?: LLamaChatCompletePromptOptions["topP"], + trimWhitespaceSuffix?: LLamaChatCompletePromptOptions["trimWhitespaceSuffix"], + evaluationPriority?: LLamaChatCompletePromptOptions["evaluationPriority"], + repeatPenalty?: LLamaChatCompletePromptOptions["repeatPenalty"], + tokenBias?: LLamaChatCompletePromptOptions["tokenBias"], + customStopTriggers?: LLamaChatCompletePromptOptions["customStopTriggers"], + grammar?: LLamaChatCompletePromptOptions["grammar"], + functions?: LLamaChatCompletePromptOptions["functions"], + documentFunctionParams?: LLamaChatCompletePromptOptions["documentFunctionParams"] +}; + +const defaultMaxPreloadTokens = 256; +const defaultMaxCachedCompletions = 100; + +export class LlamaChatSessionPromptCompletionEngine { + /** @internal */ private readonly _chatSession: LlamaChatSession; + /** @internal */ private readonly _maxPreloadTokens: number; + /** @internal */ private readonly _maxCachedCompletions: number; + /** @internal */ private readonly _onGeneration?: LLamaChatPromptCompletionEngineOptions["onGeneration"]; + /** @internal */ private readonly _completionOptions: LLamaChatCompletePromptOptions; + /** @internal */ private readonly _completionCaches = new WeakMap(); + /** @internal */ private readonly _disposeAggregator = new DisposeAggregator(); + /** @internal */ private _currentCompletionAbortController = new AbortController(); + /** @internal */ private _lastPrompt?: string; + /** @internal */ private _disposed = false; + + private constructor(chatSession: LlamaChatSession, { + maxPreloadTokens = defaultMaxPreloadTokens, + onGeneration, + maxCachedCompletions = defaultMaxCachedCompletions, + ...options + }: LLamaChatPromptCompletionEngineOptions) { + this._chatSession = chatSession; + this._maxPreloadTokens = Math.max(1, maxPreloadTokens); + this._maxCachedCompletions = Math.max(1, maxCachedCompletions); + this._onGeneration = onGeneration; + this._completionOptions = options; + + this.dispose = this.dispose.bind(this); + + this._disposeAggregator.add( + this._chatSession.onDispose.createListener(this.dispose) + ); + this._disposeAggregator.add(() => { + this._disposed = true; + this._currentCompletionAbortController.abort(); + }); + } + + public dispose() { + if (this._disposed) + return; + + this._disposeAggregator.dispose(); + } + + /** + * Get completion for the prompt from the cache, + * and begin preloading this prompt into the context sequence and completing it. + * + * On completion progress, `onGeneration` (configured for this engine instance) will be called. + */ + public complete(prompt: string): string { + if (this._disposed) + throw new DisposedError(); + + const completionCache = this._getCurrentCompletionCache(); + + const completion = completionCache.getCompletion(prompt); + + if (this._lastPrompt == null || !(this._lastPrompt + (completion ?? "")).startsWith(prompt)) { + this._lastPrompt = prompt; + this._restartCompletion(completionCache); + } + + this._lastPrompt = prompt; + + return completion ?? 
""; + } + + /** @internal */ + private _getCurrentCompletionCache() { + const completionCache = this._completionCaches.get(this._chatSession._chatHistoryStateRef); + + if (completionCache != null) + return completionCache; + + const newCompletionCache = new CompletionCache(this._maxCachedCompletions); + this._completionCaches.set(this._chatSession._chatHistoryStateRef, newCompletionCache); + return newCompletionCache; + } + + /** @internal */ + private _restartCompletion(completionCache: CompletionCache) { + if (this._disposed) + return; + + this._currentCompletionAbortController.abort(); + this._currentCompletionAbortController = new AbortController(); + const prompt = this._lastPrompt; + + if (prompt == null) + return; + + const existingCompletion = completionCache.getCompletion(prompt); + const promptToComplete = prompt + (existingCompletion ?? ""); + + const currentPromptTokens = this._chatSession.model.tokenize(promptToComplete).length; + const leftTokens = Math.max(0, this._maxPreloadTokens - currentPromptTokens); + + if (leftTokens === 0) + return; + + const currentAbortController = this._currentCompletionAbortController; + const currentAbortSignal = this._currentCompletionAbortController.signal; + const currentCompletion: Token[] = []; + void this._chatSession.completePrompt(promptToComplete, { + ...this._completionOptions, + stopOnAbortSignal: false, + maxTokens: leftTokens, + signal: currentAbortSignal, + onToken: (chunk) => { + currentCompletion.push(...chunk); + const completion = (existingCompletion ?? "") + this._chatSession.model.detokenize(currentCompletion); + completionCache.putCompletion(prompt, completion); + + if (this._getCurrentCompletionCache() !== completionCache) { + currentAbortController.abort(); + return; + } + + try { + if (this._lastPrompt === prompt && this._onGeneration != null) + this._onGeneration(prompt, completion); + } catch (err) { + console.error(err); + } + } + }) + .then(() => { + if (this._lastPrompt !== prompt && this._getCurrentCompletionCache() === completionCache) + return this._restartCompletion(completionCache); + }) + .catch((err) => { + if (currentAbortSignal.aborted && err === currentAbortSignal.reason) + return; + + console.error(getConsoleLogPrefix(false, false), err); + }); + } + + /** @internal */ + public static _create(chatSession: LlamaChatSession, options: LLamaChatPromptCompletionEngineOptions = {}) { + return new LlamaChatSessionPromptCompletionEngine(chatSession, options); + } +} + +class CompletionCache { + /** @internal */ private readonly _cache: LruCache; + /** @internal */ private readonly _rootNode: InputNode = [new Map()]; + + public constructor(maxInputs: number) { + this._cache = new LruCache(maxInputs, { + onDelete: (key) => { + this._deleteInput(key); + } + }); + } + + public get maxInputs() { + return this._cache.maxSize; + } + + public getCompletion(input: string): string | null { + let node: InputNode | undefined = this._rootNode; + + for (let i = 0; i < input.length; i++) { + if (node == null) + return null; + + const [next, completion]: InputNode = node; + const char = input[i]; + + if (!next.has(char)) { + if (completion != null && completion.startsWith(input.slice(i))) { + this._cache.get(input.slice(0, i)); + return completion.slice(input.length - i); + } + } + + node = next.get(char); + } + + if (node == null) + return null; + + const [, possibleCompletion] = node; + if (possibleCompletion != null) { + this._cache.get(input); + return possibleCompletion; + } + + return null; + } + + public 
putCompletion(input: string, completion: string): string { + this._cache.set(input, null); + + let node = this._rootNode; + for (let i = 0; i < input.length; i++) { + const [next] = node; + const char = input[i]; + + if (!next.has(char)) + next.set(char, [new Map()]); + + node = next.get(char)!; + } + + const currentCompletion = node[1]; + if (currentCompletion != null && currentCompletion.startsWith(completion)) + return currentCompletion; + + node[1] = completion; + return completion; + } + + /** @internal */ + private _deleteInput(input: string) { + let lastNodeWithMultipleChildren: InputNode = this._rootNode; + let lastNodeWithMultipleChildrenDeleteChar: string = input[0]; + + let node = this._rootNode; + for (let i = 0; i < input.length; i++) { + const [next] = node; + const char = input[i]; + + if (next.size > 1) { + lastNodeWithMultipleChildren = node; + lastNodeWithMultipleChildrenDeleteChar = char; + } + + if (!next.has(char)) + return; + + node = next.get(char)!; + } + + if (lastNodeWithMultipleChildrenDeleteChar !== "") + lastNodeWithMultipleChildren[0].delete(lastNodeWithMultipleChildrenDeleteChar); + } +} + +type InputNode = [ + next: Map, + completion?: string +]; diff --git a/src/index.ts b/src/index.ts index f2f8f25b..35a8bbae 100644 --- a/src/index.ts +++ b/src/index.ts @@ -18,13 +18,16 @@ import { import {TokenBias} from "./evaluator/TokenBias.js"; import { LlamaChatSession, type LlamaChatSessionOptions, type LlamaChatSessionContextShiftOptions, - type LLamaChatPromptOptions, type LLamaChatPreloadPromptOptions, type LlamaChatSessionRepeatPenalty + type LLamaChatPromptOptions, type LLamaChatCompletePromptOptions, type LlamaChatSessionRepeatPenalty } from "./evaluator/LlamaChatSession/LlamaChatSession.js"; import {defineChatSessionFunction} from "./evaluator/LlamaChatSession/utils/defineChatSessionFunction.js"; import { LlamaChat, type LlamaChatOptions, type LLamaChatGenerateResponseOptions, type LLamaChatLoadAndCompleteUserMessageOptions, type LLamaChatContextShiftOptions, type LlamaChatResponse, type LlamaChatResponseFunctionCall, type LlamaChatLoadAndCompleteUserResponse } from "./evaluator/LlamaChat/LlamaChat.js"; +import { + LlamaChatSessionPromptCompletionEngine, type LLamaChatPromptCompletionEngineOptions +} from "./evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.js"; import { LlamaCompletion, type LlamaCompletionOptions, type LlamaCompletionGenerationOptions, type LlamaInfillGenerationOptions } from "./evaluator/LlamaCompletion.js"; @@ -116,7 +119,7 @@ export { type LlamaChatSessionOptions, type LlamaChatSessionContextShiftOptions, type LLamaChatPromptOptions, - type LLamaChatPreloadPromptOptions, + type LLamaChatCompletePromptOptions, type LlamaChatSessionRepeatPenalty, LlamaChat, type LlamaChatOptions, @@ -127,6 +130,8 @@ export { type LlamaChatResponse, type LlamaChatResponseFunctionCall, type LlamaChatLoadAndCompleteUserResponse, + LlamaChatSessionPromptCompletionEngine, + type LLamaChatPromptCompletionEngineOptions, LlamaCompletion, type LlamaCompletionOptions, type LlamaCompletionGenerationOptions, diff --git a/src/utils/LruCache.ts b/src/utils/LruCache.ts new file mode 100644 index 00000000..a77f87e0 --- /dev/null +++ b/src/utils/LruCache.ts @@ -0,0 +1,58 @@ +export class LruCache { + public readonly maxSize: number; + /** @internal */ private readonly _cache = new Map(); + /** @internal */ private readonly _onDelete?: (key: Key, value: Value) => void; + + public constructor(maxSize: number, { + onDelete + }: { + onDelete?(key: Key, 
value: Value): void + } = {}) { + this.maxSize = maxSize; + this._onDelete = onDelete; + } + + public get(key: Key) { + if (!this._cache.has(key)) + return undefined; + + // move the key to the end of the cache + const item = this._cache.get(key)!; + this._cache.delete(key); + this._cache.set(key, item); + return item; + } + + public set(key: Key, value: Value) { + if (this._cache.has(key)) + this._cache.delete(key); + else if (this._cache.size >= this.maxSize) { + const firstKey = this.firstKey; + + if (this._onDelete != null) + this._onDelete(firstKey, this._cache.get(firstKey)!); + + this._cache.delete(firstKey); + } + + this._cache.set(key, value); + return this; + } + + public get firstKey() { + return this._cache.keys() + .next().value; + } + + public clear() { + this._cache.clear(); + } + + public keys() { + return this._cache.keys(); + } + + public delete(key: Key) { + this._cache.delete(key); + } +} diff --git a/src/utils/getConsoleLogPrefix.ts b/src/utils/getConsoleLogPrefix.ts index 6ba1c6ab..fc8f7282 100644 --- a/src/utils/getConsoleLogPrefix.ts +++ b/src/utils/getConsoleLogPrefix.ts @@ -10,4 +10,3 @@ export function getConsoleLogPrefix(forcePrefix: boolean = false, padEnd: boolea return ""; } - diff --git a/src/utils/wrapAbortSignal.ts b/src/utils/wrapAbortSignal.ts new file mode 100644 index 00000000..cce2dac4 --- /dev/null +++ b/src/utils/wrapAbortSignal.ts @@ -0,0 +1,10 @@ +export function wrapAbortSignal(abortSignal?: AbortSignal) { + const controller = new AbortController(); + + if (abortSignal != null) + abortSignal.addEventListener("abort", () => { + controller.abort(abortSignal.reason); + }); + + return controller; +} From 2ea5265137b25444f5fe29593d1bd8ba4b33b22c Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sat, 25 May 2024 05:28:16 +0300 Subject: [PATCH 06/39] feat: add prompt completion to the Electron example --- .../electron/rpc/llmRpc.ts | 1 + .../electron/state/llmState.ts | 99 ++++++++++++----- .../electron-typescript-react/src/App/App.tsx | 7 ++ .../src/App/components/Header/Header.tsx | 2 +- .../src/App/components/InputRow/InputRow.css | 96 ++++++++++++++--- .../src/App/components/InputRow/InputRow.tsx | 102 ++++++++++++++---- .../src/state/llmState.ts | 6 +- 7 files changed, 246 insertions(+), 67 deletions(-) diff --git a/templates/electron-typescript-react/electron/rpc/llmRpc.ts b/templates/electron-typescript-react/electron/rpc/llmRpc.ts index 4731a4ed..00177022 100644 --- a/templates/electron-typescript-react/electron/rpc/llmRpc.ts +++ b/templates/electron-typescript-react/electron/rpc/llmRpc.ts @@ -43,6 +43,7 @@ export class ElectronLlmRpc { getState() { return llmState.state; }, + setDraftPrompt: llmFunctions.chatSession.setDraftPrompt, prompt: llmFunctions.chatSession.prompt, stopActivePrompt: llmFunctions.chatSession.stopActivePrompt, resetChatHistory: llmFunctions.chatSession.resetChatHistory diff --git a/templates/electron-typescript-react/electron/state/llmState.ts b/templates/electron-typescript-react/electron/state/llmState.ts index f2472859..7d3e3586 100644 --- a/templates/electron-typescript-react/electron/state/llmState.ts +++ b/templates/electron-typescript-react/electron/state/llmState.ts @@ -1,5 +1,5 @@ import path from "node:path"; -import {getLlama, Llama, LlamaChatSession, LlamaContext, LlamaContextSequence, LlamaModel, Token} from "node-llama-cpp"; +import {getLlama, Llama, LlamaChatSession, LlamaChatSessionPromptCompletionEngine, LlamaContext, LlamaContextSequence, LlamaModel, Token} from "node-llama-cpp"; import {withLock, State} from 
"lifecycle-utils"; export const llmState = new State({ @@ -18,7 +18,11 @@ export const llmState = new State({ chatSession: { loaded: false, generatingResult: false, - simplifiedChat: [] + simplifiedChat: [], + draftPrompt: { + prompt: "", + completion: "" + } } }); @@ -45,7 +49,11 @@ export type LlmState = { chatSession: { loaded: boolean, generatingResult: boolean, - simplifiedChat: SimplifiedChatItem[] + simplifiedChat: SimplifiedChatItem[], + draftPrompt: { + prompt: string, + completion: string + } } }; @@ -60,6 +68,7 @@ let context: LlamaContext | null = null; let contextSequence: LlamaContextSequence | null = null; let chatSession: LlamaChatSession | null = null; +let chatSessionCompletionEngine: LlamaChatSessionPromptCompletionEngine | null = null; let promptAbortController: AbortController | null = null; const inProgressResponse: Token[] = []; @@ -256,6 +265,7 @@ export const llmFunctions = { try { chatSession.dispose(); chatSession = null; + chatSessionCompletionEngine = null; } catch (err) { console.error("Failed to dispose chat session", err); } @@ -267,32 +277,12 @@ export const llmFunctions = { chatSession: { loaded: false, generatingResult: false, - simplifiedChat: [] + simplifiedChat: [], + draftPrompt: llmState.state.chatSession.draftPrompt } }; - chatSession = new LlamaChatSession({ - contextSequence - }); - llmState.state = { - ...llmState.state, - chatSession: { - loaded: true, - generatingResult: false, - simplifiedChat: [] - } - }; - - chatSession.onDispose.createListener(() => { - llmState.state = { - ...llmState.state, - chatSession: { - loaded: false, - generatingResult: false, - simplifiedChat: [] - } - }; - }); + llmFunctions.chatSession.resetChatHistory(); } catch (err) { console.error("Failed to create chat session", err); llmState.state = { @@ -300,7 +290,8 @@ export const llmFunctions = { chatSession: { loaded: false, generatingResult: false, - simplifiedChat: [] + simplifiedChat: [], + draftPrompt: llmState.state.chatSession.draftPrompt } }; } @@ -359,15 +350,65 @@ export const llmFunctions = { if (contextSequence == null) return; + chatSession?.dispose(); chatSession = new LlamaChatSession({ - contextSequence + contextSequence, + autoDisposeSequence: false + }); + chatSessionCompletionEngine = chatSession.createPromptCompletionEngine({ + onGeneration(prompt, completion) { + if (llmState.state.chatSession.draftPrompt.prompt === prompt) { + llmState.state = { + ...llmState.state, + chatSession: { + ...llmState.state.chatSession, + draftPrompt: { + prompt, + completion + } + } + }; + } + } }); + llmState.state = { + ...llmState.state, + chatSession: { + loaded: true, + generatingResult: false, + simplifiedChat: [], + draftPrompt: { + prompt: llmState.state.chatSession.draftPrompt.prompt, + completion: chatSessionCompletionEngine.complete(llmState.state.chatSession.draftPrompt.prompt) + } + } + }; + + chatSession.onDispose.createListener(() => { + llmState.state = { + ...llmState.state, + chatSession: { + loaded: false, + generatingResult: false, + simplifiedChat: [], + draftPrompt: llmState.state.chatSession.draftPrompt + } + }; + }); + }, + setDraftPrompt(prompt: string) { + if (chatSessionCompletionEngine == null) + return; + llmState.state = { ...llmState.state, chatSession: { ...llmState.state.chatSession, - simplifiedChat: [] + draftPrompt: { + prompt: prompt, + completion: chatSessionCompletionEngine.complete(prompt) + } } }; } diff --git a/templates/electron-typescript-react/src/App/App.tsx b/templates/electron-typescript-react/src/App/App.tsx index 
a82d4aa5..a6cff083 100644 --- a/templates/electron-typescript-react/src/App/App.tsx +++ b/templates/electron-typescript-react/src/App/App.tsx @@ -67,6 +67,10 @@ export function App() { void electronLlmRpc.prompt(prompt); }, [generatingResult]); + const onPromptInput = useCallback((currentText: string) => { + void electronLlmRpc.setDraftPrompt(currentText); + }, []); + const error = state.llama.error ?? state.model.error ?? state.context.error ?? state.contextSequence.error; const showMessage = state.selectedModelFilePath == null || error != null || state.chatSession.simplifiedChat.length === 0; @@ -121,9 +125,12 @@ export function App() { ? stopActivePrompt : undefined } + onPromptInput={onPromptInput} sendPrompt={sendPrompt} generatingResult={generatingResult} contextSequenceLoaded={state.contextSequence.loaded} + autocompleteInputDraft={state.chatSession.draftPrompt.prompt} + autocompleteCompletion={state.chatSession.draftPrompt.completion} /> ; } diff --git a/templates/electron-typescript-react/src/App/components/Header/Header.tsx b/templates/electron-typescript-react/src/App/components/Header/Header.tsx index cc0ff213..17f59385 100644 --- a/templates/electron-typescript-react/src/App/components/Header/Header.tsx +++ b/templates/electron-typescript-react/src/App/components/Header/Header.tsx @@ -12,7 +12,7 @@ export function Header({modelName, onLoadClick, loadPercentage, onResetChatClick
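(Editorial note, not part of the patch.) The Electron wiring in this commit routes each keystroke through IPC (`setDraftPrompt`) into the new prompt completion engine and mirrors the resulting completion back into `llmState` for the input row to render. Outside Electron, the same pattern reduces to a few direct calls against the API added earlier in this series. The following is a minimal sketch under assumptions: the model path is a placeholder, error handling and disposal are omitted, and it only illustrates the `createPromptCompletionEngine` / `complete` flow introduced by this patch.

    import {getLlama, LlamaChatSession} from "node-llama-cpp";

    const llama = await getLlama();
    // placeholder path - point this at a real .gguf model file
    const model = await llama.loadModel({modelPath: "path/to/model.gguf"});
    const context = await model.createContext();
    const session = new LlamaChatSession({contextSequence: context.getSequence()});

    // one engine per chat session; completions are cached for the current chat state
    const completionEngine = session.createPromptCompletionEngine({
        onGeneration(prompt, completion) {
            // called as more of the completion is generated in the background
            console.log(`completion for "${prompt}":`, completion);
        }
    });

    // returns whatever is already cached (possibly an empty string) and starts
    // preloading and completing the draft prompt in the background
    const cachedCompletion = completionEngine.complete("Hi there! How");
    console.log(cachedCompletion);

In the Electron template this is exactly what `chatSessionCompletionEngine.complete(prompt)` does inside `setDraftPrompt`, with `onGeneration` pushing updated completions into the shared state instead of logging them.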
diff --git a/templates/electron-typescript-react/src/App/components/InputRow/InputRow.css b/templates/electron-typescript-react/src/App/components/InputRow/InputRow.css index 51491f6d..4a9bb3f5 100644 --- a/templates/electron-typescript-react/src/App/components/InputRow/InputRow.css +++ b/templates/electron-typescript-react/src/App/components/InputRow/InputRow.css @@ -13,24 +13,90 @@ z-index: 10; align-items: flex-end; - > .input { + > .inputContainer { flex: 1; - border: none; - resize: none; - box-sizing: border-box; - max-height: 160px; - height: 55px; - outline: none; - padding: 12px 24px; - background-color: transparent; - font: inherit; - align-content: center; - align-self: stretch; - color: var(--panel-text-color); + display: flex; + flex-direction: row; + overflow: hidden; + position: relative; + isolation: isolate; + max-height: 400px; + min-height: var(--min-height); + --min-height: 55px; - &::placeholder { + > .input { + flex: 1; + border: none; + resize: none; + box-sizing: border-box; + max-height: 160px; + min-height: var(--min-height); + height: 55px; + outline: none; + padding: calc((var(--min-height) - 1lh) / 2) 24px; + background-color: transparent; + font: inherit; + align-content: center; + align-self: stretch; color: var(--panel-text-color); - opacity: 0.4; + z-index: 2; + unicode-bidi: plaintext; + overflow: auto; + + &::placeholder { + color: var(--panel-text-color); + opacity: 0.4; + } + } + + > .autocomplete { + position: absolute; + inset: 0px; + z-index: 1; + display: flex; + overflow: hidden; + pointer-events: none; + user-select: none; + + > .content { + flex: 1; + flex-shrink: 0; + font: inherit; + padding: calc((var(--min-height) - 1lh) / 2) 24px; + text-align: initial; + unicode-bidi: plaintext; + overflow: hidden; + opacity: 0.36; + + &.hide { + opacity: 0; + } + + > .currentText { + opacity: 0; + display: inline; + white-space: pre-wrap; + word-break: break-word; + } + + > .completion { + display: inline; + white-space: pre-wrap; + word-break: break-word; + } + + > .pressTab { + display: inline-block; + margin: -1px 8px; + opacity: 0.8; + border: solid 1px color-mix(in srgb, currentColor, transparent 64%); + border-bottom-width: 2px; + border-radius: 8px; + padding: 0.1em 0.4em; + font-size: 0.8em; + vertical-align: top; + } + } } } diff --git a/templates/electron-typescript-react/src/App/components/InputRow/InputRow.tsx b/templates/electron-typescript-react/src/App/components/InputRow/InputRow.tsx index 44dc1db5..a8ebdb37 100644 --- a/templates/electron-typescript-react/src/App/components/InputRow/InputRow.tsx +++ b/templates/electron-typescript-react/src/App/components/InputRow/InputRow.tsx @@ -1,20 +1,47 @@ -import {useCallback, useRef, useState} from "react"; +import {useCallback, useMemo, useRef, useState} from "react"; +import classNames from "classnames"; import {AddMessageIconSVG} from "../../../icons/AddMessageIconSVG.tsx"; import {AbortIconSVG} from "../../../icons/AbortIconSVG.tsx"; import "./InputRow.css"; -export function InputRow({stopGeneration, sendPrompt, generatingResult, contextSequenceLoaded}: InputRowProps) { - const [inputEmpty, setInputEmpty] = useState(true); +export function InputRow({ + stopGeneration, sendPrompt, onPromptInput, autocompleteInputDraft, autocompleteCompletion, generatingResult, contextSequenceLoaded +}: InputRowProps) { + const [inputText, setInputText] = useState(""); const inputRef = useRef(null); + const autocompleteRef = useRef(null); + const autocompleteCurrentTextRef = useRef(null); + + const 
autocompleteText = useMemo(() => { + const fullText = (autocompleteInputDraft ?? "") + (autocompleteCompletion ?? ""); + if (fullText.startsWith(inputText)) + return fullText.slice(inputText.length); + + return ""; + }, [inputText, autocompleteInputDraft, autocompleteCompletion]); + + const setInputValue = useCallback((value: string) => { + if (inputRef.current != null) + inputRef.current.value = value; + + if (autocompleteCurrentTextRef.current != null) + autocompleteCurrentTextRef.current.innerText = value; + + setInputText(value); + }, []); const resizeInput = useCallback(() => { if (inputRef.current == null) return; - inputRef.current.style.minHeight = ""; - inputRef.current.style.minHeight = inputRef.current.scrollHeight + "px"; + inputRef.current.style.height = ""; + inputRef.current.style.height = inputRef.current.scrollHeight + "px"; + + if (autocompleteRef.current != null) { + autocompleteRef.current.scrollTop = inputRef.current.scrollTop; + } }, []); const submitPrompt = useCallback(() => { @@ -25,35 +52,65 @@ export function InputRow({stopGeneration, sendPrompt, generatingResult, contextS if (message.length === 0) return; - inputRef.current.value = ""; + setInputValue(""); resizeInput(); sendPrompt(message); - }, [generatingResult, resizeInput, sendPrompt]); + onPromptInput?.(""); + }, [setInputValue, generatingResult, resizeInput, sendPrompt, onPromptInput]); const onInput = useCallback(() => { - setInputEmpty(inputRef.current?.value.length === 0); + setInputText(inputRef.current?.value ?? ""); resizeInput(); - }, [resizeInput]); + + if (autocompleteCurrentTextRef.current != null && inputRef.current != null) + autocompleteCurrentTextRef.current.innerText = inputRef.current?.value; + + if (inputRef.current != null && onPromptInput != null) + onPromptInput(inputRef.current?.value); + }, [resizeInput, onPromptInput]); const onInputKeyDown = useCallback((event: React.KeyboardEvent) => { if (event.key === "Enter" && !event.shiftKey) { event.preventDefault(); submitPrompt(); + resizeInput(); + } else if (event.key === "Tab" && !event.shiftKey && !event.ctrlKey && !event.metaKey && !event.altKey) { + event.preventDefault(); + if (inputRef.current != null && autocompleteText !== "") { + setInputValue(inputRef.current.value + autocompleteText); + inputRef.current.scrollTop = inputRef.current.scrollHeight; + onPromptInput?.(inputRef.current.value); + } + resizeInput(); } - }, [submitPrompt]); + }, [submitPrompt, setInputValue, onPromptInput, resizeInput, autocompleteText]); return
-