diff --git a/docs/core_docs/docs/integrations/chat/index.mdx b/docs/core_docs/docs/integrations/chat/index.mdx
index 7251424c42f2..573af3aaa975 100644
--- a/docs/core_docs/docs/integrations/chat/index.mdx
+++ b/docs/core_docs/docs/integrations/chat/index.mdx
@@ -25,7 +25,7 @@ Each ChatModel integration can optionally provide native implementations to trul
 | ChatCloudflareWorkersAI | ✅ | ✅ | ✅ |
 | ChatFireworks | ✅ | ✅ | ✅ |
 | ChatGooglePaLM | ✅ | ❌ | ✅ |
-| ChatLlamaCpp | ✅ | ❌ | ✅ |
+| ChatLlamaCpp | ✅ | ✅ | ✅ |
 | ChatMinimax | ✅ | ❌ | ✅ |
 | ChatOllama | ✅ | ✅ | ✅ |
 | ChatOpenAI | ✅ | ✅ | ✅ |
diff --git a/docs/core_docs/docs/integrations/chat/llama_cpp.mdx b/docs/core_docs/docs/integrations/chat/llama_cpp.mdx
index 420583762827..433caa674b53 100644
--- a/docs/core_docs/docs/integrations/chat/llama_cpp.mdx
+++ b/docs/core_docs/docs/integrations/chat/llama_cpp.mdx
@@ -47,8 +47,16 @@ import SystemExample from "@examples/models/chat/integration_llama_cpp_system.ts

 ### Chains

-Finally we can also use this module with chains, note that using more complex chains will require suitably powerful version of `llama2` such as the 70B version.
+This module can also be used with chains. Note that more complex chains require a suitably powerful version of `llama2`, such as the 70B version.

 import ChainExample from "@examples/models/chat/integration_llama_cpp_chain.ts";

 <CodeBlock language="typescript">{ChainExample}</CodeBlock>
+
+### Streaming
+
+We can also stream with Llama CPP:
+
+import StreamExample from "@examples/models/chat/integration_llama_cpp_stream.ts";
+
+<CodeBlock language="typescript">{StreamExample}</CodeBlock>
diff --git a/docs/core_docs/docs/integrations/llms/index.mdx b/docs/core_docs/docs/integrations/llms/index.mdx
index 5dbd8106e484..c894e5cf4332 100644
--- a/docs/core_docs/docs/integrations/llms/index.mdx
+++ b/docs/core_docs/docs/integrations/llms/index.mdx
@@ -27,7 +27,7 @@ Each LLM integration can optionally provide native implementations for invoke, s
 | Fireworks | ✅ | ✅ | ✅ |
 | GooglePaLM | ✅ | ❌ | ✅ |
 | HuggingFaceInference | ✅ | ❌ | ✅ |
-| LlamaCpp | ✅ | ❌ | ✅ |
+| LlamaCpp | ✅ | ✅ | ✅ |
 | Ollama | ✅ | ✅ | ✅ |
 | OpenAIChat | ✅ | ✅ | ✅ |
 | PromptLayerOpenAIChat | ✅ | ✅ | ✅ |
diff --git a/docs/core_docs/docs/integrations/llms/llama_cpp.mdx b/docs/core_docs/docs/integrations/llms/llama_cpp.mdx
index 0407e643e212..a23ec7acd9e6 100644
--- a/docs/core_docs/docs/integrations/llms/llama_cpp.mdx
+++ b/docs/core_docs/docs/integrations/llms/llama_cpp.mdx
@@ -105,3 +105,9 @@ import CodeBlock from "@theme/CodeBlock";
 import LlamaCppExample from "@examples/models/llm/llama_cpp.ts";

 <CodeBlock language="typescript">{LlamaCppExample}</CodeBlock>
+
+## Streaming
+
+import LlamaCppStreamExample from "@examples/models/llm/llama_cpp_stream.ts";
+
+<CodeBlock language="typescript">{LlamaCppStreamExample}</CodeBlock>
diff --git a/examples/src/models/chat/integration_llama_cpp_stream.ts b/examples/src/models/chat/integration_llama_cpp_stream.ts
new file mode 100644
index 000000000000..b43ae2320d42
--- /dev/null
+++ b/examples/src/models/chat/integration_llama_cpp_stream.ts
@@ -0,0 +1,29 @@
+import { ChatLlamaCpp } from "langchain/chat_models/llama_cpp";
+
+const llamaPath = "/Replace/with/path/to/your/model/gguf-llama2-q4_0.bin";
+
+const model = new ChatLlamaCpp({ modelPath: llamaPath, temperature: 0.7 });
+
+const stream = await model.stream([
+  ["human", "Tell me a short story about a happy Llama."],
+]);
+
+for await (const chunk of stream) {
+  console.log(chunk.content);
+}
+
+/*
+
+ Once
+ upon
+ a
+ time
+ ,
+ in
+ a
+ green
+ and
+ sunny
+ field
+ ...
+*/
diff --git a/examples/src/models/llm/llama_cpp.ts b/examples/src/models/llm/llama_cpp.ts
index 63dff29efbc0..d5428c544837 100644
--- a/examples/src/models/llm/llama_cpp.ts
+++ b/examples/src/models/llm/llama_cpp.ts
@@ -6,5 +6,5 @@ const question = "Where do Llamas come from?";
 const model = new LlamaCpp({ modelPath: llamaPath });

 console.log(`You: ${question}`);
-const response = await model.call(question);
+const response = await model.invoke(question);
 console.log(`AI : ${response}`);
diff --git a/examples/src/models/llm/llama_cpp_stream.ts b/examples/src/models/llm/llama_cpp_stream.ts
new file mode 100644
index 000000000000..a7ca0e35dd6d
--- /dev/null
+++ b/examples/src/models/llm/llama_cpp_stream.ts
@@ -0,0 +1,30 @@
+import { LlamaCpp } from "langchain/llms/llama_cpp";
+
+const llamaPath = "/Replace/with/path/to/your/model/gguf-llama2-q4_0.bin";
+
+const model = new LlamaCpp({ modelPath: llamaPath, temperature: 0.7 });
+
+const prompt = "Tell me a short story about a happy Llama.";
+
+const stream = await model.stream(prompt);
+
+for await (const chunk of stream) {
+  console.log(chunk);
+}
+
+/*
+
+
+ Once
+ upon
+ a
+ time
+ ,
+ in
+ the
+ rolling
+ hills
+ of
+ Peru
+ ...
+ */
diff --git a/langchain/src/chat_models/llama_cpp.ts b/langchain/src/chat_models/llama_cpp.ts
index c68712ddf2f5..76ad790708c8 100644
--- a/langchain/src/chat_models/llama_cpp.ts
+++ b/langchain/src/chat_models/llama_cpp.ts
@@ -11,7 +11,12 @@ import {
   createLlamaContext,
 } from "../util/llama_cpp.js";
 import { BaseLanguageModelCallOptions } from "../base_language/index.js";
-import type { BaseMessage } from "../schema/index.js";
+import { CallbackManagerForLLMRun } from "../callbacks/manager.js";
+import {
+  BaseMessage,
+  ChatGenerationChunk,
+  AIMessageChunk,
+} from "../schema/index.js";

 /**
  * Note that the modelPath is the only required parameter. For testing you
@@ -145,6 +150,42 @@ export class ChatLlamaCpp extends SimpleChatModel {
     }
   }

+  async *_streamResponseChunks(
+    input: BaseMessage[],
+    _options: this["ParsedCallOptions"],
+    runManager?: CallbackManagerForLLMRun
+  ): AsyncGenerator<ChatGenerationChunk> {
+    if (input.length !== 1) {
+      throw new Error("Only one human message should be provided.");
+    } else {
+      const promptOptions = {
+        temperature: this?.temperature,
+        topK: this?.topK,
+        topP: this?.topP,
+      };
+
+      const stream = await this.caller.call(async () =>
+        this._context.evaluate(
+          this._context.encode(`${input[0].content}`),
+          promptOptions
+        )
+      );
+
+      for await (const chunk of stream) {
+        yield new ChatGenerationChunk({
+          text: this._context.decode([chunk]),
+          message: new AIMessageChunk({
+            content: this._context.decode([chunk]),
+          }),
+          generationInfo: {},
+        });
+        await runManager?.handleLLMNewToken(
+          this._context.decode([chunk]) ?? ""
+        );
+      }
+    }
+  }
+
   // This constructs a new session if we need to adding in any sys messages or previous chats
   protected _buildSession(messages: BaseMessage[]): string {
     let prompt = "";
diff --git a/langchain/src/chat_models/tests/chatllama_cpp.int.test.ts b/langchain/src/chat_models/tests/chatllama_cpp.int.test.ts
index 8bb94ed91671..3f33e5a4d113 100644
--- a/langchain/src/chat_models/tests/chatllama_cpp.int.test.ts
+++ b/langchain/src/chat_models/tests/chatllama_cpp.int.test.ts
@@ -81,3 +81,19 @@ test.skip("Test chain with memory", async () => {
   const response3 = await chain.call({ input: "What is your name?" });
   console.log({ response3 });
 });
+
+test.skip("test streaming call", async () => {
+  const llamaCpp = new ChatLlamaCpp({ modelPath: llamaPath, temperature: 0.7 });
+
+  const stream = await llamaCpp.stream([
+    ["human", "Tell me a short story about a happy Llama."],
+  ]);
+
+  const chunks = [];
+  for await (const chunk of stream) {
+    chunks.push(chunk.content);
+    process.stdout.write(chunks.join(""));
+  }
+
+  expect(chunks.length).toBeGreaterThan(1);
+});
diff --git a/langchain/src/llms/llama_cpp.ts b/langchain/src/llms/llama_cpp.ts
index 44ae0009ff61..5081bde04008 100644
--- a/langchain/src/llms/llama_cpp.ts
+++ b/langchain/src/llms/llama_cpp.ts
@@ -6,6 +6,8 @@ import {
   createLlamaSession,
 } from "../util/llama_cpp.js";
 import { LLM, BaseLLMCallOptions, BaseLLMParams } from "./base.js";
+import { CallbackManagerForLLMRun } from "../callbacks/manager.js";
+import { GenerationChunk } from "../schema/index.js";

 /**
  * Note that the modelPath is the only required parameter. For testing you
@@ -70,8 +72,7 @@ export class LlamaCpp extends LLM {
   /** @ignore */
   async _call(
     prompt: string,
-    // @ts-expect-error - TS6133: 'options' is declared but its value is never read.
-    options?: this["ParsedCallOptions"]
+    _options?: this["ParsedCallOptions"]
   ): Promise<string> {
     try {
       const promptOptions = {
@@ -87,4 +88,28 @@ export class LlamaCpp extends LLM {
       throw new Error("Error getting prompt completion.");
     }
   }
+
+  async *_streamResponseChunks(
+    prompt: string,
+    _options: this["ParsedCallOptions"],
+    runManager?: CallbackManagerForLLMRun
+  ): AsyncGenerator<GenerationChunk> {
+    const promptOptions = {
+      temperature: this?.temperature,
+      topK: this?.topK,
+      topP: this?.topP,
+    };
+
+    const stream = await this.caller.call(async () =>
+      this._context.evaluate(this._context.encode(prompt), promptOptions)
+    );
+
+    for await (const chunk of stream) {
+      yield new GenerationChunk({
+        text: this._context.decode([chunk]),
+        generationInfo: {},
+      });
+      await runManager?.handleLLMNewToken(this._context.decode([chunk]) ?? "");
+    }
+  }
 }
diff --git a/langchain/src/llms/tests/llama_cpp.int.test.ts b/langchain/src/llms/tests/llama_cpp.int.test.ts
index 85dd917ae4e8..d0fe6cc4268e 100644
--- a/langchain/src/llms/tests/llama_cpp.int.test.ts
+++ b/langchain/src/llms/tests/llama_cpp.int.test.ts
@@ -29,3 +29,19 @@ test.skip("Test Llama_CPP", async () => {
     );
   }
 }, 100000);
+
+test.skip("Test Llama_CPP streaming", async () => {
+  const model = new LlamaCpp({ modelPath: llamaPath, temperature: 0.7 });
+
+  const stream = await model.stream(
+    "Tell me a short story about a happy Llama."
+  );
+
+  const chunks = [];
+  for await (const chunk of stream) {
+    chunks.push(chunk);
+    process.stdout.write(chunks.join(""));
+  }
+
+  expect(chunks.length).toBeGreaterThan(1);
+});
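For anyone trying out the change, here is a minimal sketch (not part of the patch above) of how the new native streaming behaves once the model is composed into a chain: `_streamResponseChunks` is what the runnable `.stream()` machinery calls under the hood, so streaming a piped chain now yields tokens as they are generated instead of a single final string. The prompt template, topic value, and model path below are illustrative placeholders; the import paths follow the same `langchain/...` layout used in the patch.

```typescript
import { LlamaCpp } from "langchain/llms/llama_cpp";
import { PromptTemplate } from "langchain/prompts";

// Placeholder path, as in the examples above.
const llamaPath = "/Replace/with/path/to/your/model/gguf-llama2-q4_0.bin";

const model = new LlamaCpp({ modelPath: llamaPath, temperature: 0.7 });

// A tiny pipeline: prompt -> LlamaCpp.
const prompt = PromptTemplate.fromTemplate(
  "Tell me a short story about {topic}."
);
const chain = prompt.pipe(model);

// Because LlamaCpp now implements _streamResponseChunks, this stream emits
// tokens as the model produces them rather than one completed string.
const stream = await chain.stream({ topic: "a happy Llama" });

for await (const chunk of stream) {
  process.stdout.write(chunk);
}
```

The same applies to `ChatLlamaCpp`, whose streamed chunks arrive as message chunks (see `AIMessageChunk` in the patch) and can be reduced to plain strings with an output parser if desired.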