diff --git a/docs/core_docs/docs/integrations/chat/index.mdx b/docs/core_docs/docs/integrations/chat/index.mdx
index 7251424c42f2..573af3aaa975 100644
--- a/docs/core_docs/docs/integrations/chat/index.mdx
+++ b/docs/core_docs/docs/integrations/chat/index.mdx
@@ -25,7 +25,7 @@ Each ChatModel integration can optionally provide native implementations to trul
| ChatCloudflareWorkersAI | ✅ | ✅ | ✅ |
| ChatFireworks | ✅ | ✅ | ✅ |
| ChatGooglePaLM | ✅ | ❌ | ✅ |
-| ChatLlamaCpp | ✅ | ❌ | ✅ |
+| ChatLlamaCpp | ✅ | ✅ | ✅ |
| ChatMinimax | ✅ | ❌ | ✅ |
| ChatOllama | ✅ | ✅ | ✅ |
| ChatOpenAI | ✅ | ✅ | ✅ |
diff --git a/docs/core_docs/docs/integrations/chat/llama_cpp.mdx b/docs/core_docs/docs/integrations/chat/llama_cpp.mdx
index 420583762827..433caa674b53 100644
--- a/docs/core_docs/docs/integrations/chat/llama_cpp.mdx
+++ b/docs/core_docs/docs/integrations/chat/llama_cpp.mdx
@@ -47,8 +47,16 @@ import SystemExample from "@examples/models/chat/integration_llama_cpp_system.ts
### Chains
-Finally we can also use this module with chains, note that using more complex chains will require suitably powerful version of `llama2` such as the 70B version.
+This module can also be used with chains. Note that more complex chains require a suitably powerful version of `llama2`, such as the 70B version.
import ChainExample from "@examples/models/chat/integration_llama_cpp_chain.ts";
{ChainExample}
+
+### Streaming
+
+We can also stream with Llama CPP:
+
+import StreamExample from "@examples/models/chat/integration_llama_cpp_stream.ts";
+
+{StreamExample}
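A minimal sketch of gathering the streamed chat chunks back into one reply, mirroring the example file added in this PR (the model path is the same placeholder):

```typescript
import { ChatLlamaCpp } from "langchain/chat_models/llama_cpp";

// Same placeholder path as the examples in this PR.
const llamaPath = "/Replace/with/path/to/your/model/gguf-llama2-q4_0.bin";

const model = new ChatLlamaCpp({ modelPath: llamaPath, temperature: 0.7 });

const stream = await model.stream([
  ["human", "Tell me a short story about a happy Llama."],
]);

// Each chunk carries a piece of the reply in `content`; concatenating them
// reconstructs the full message text.
let reply = "";
for await (const chunk of stream) {
  reply += chunk.content;
}
console.log(reply);
```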
diff --git a/docs/core_docs/docs/integrations/llms/index.mdx b/docs/core_docs/docs/integrations/llms/index.mdx
index 5dbd8106e484..c894e5cf4332 100644
--- a/docs/core_docs/docs/integrations/llms/index.mdx
+++ b/docs/core_docs/docs/integrations/llms/index.mdx
@@ -27,7 +27,7 @@ Each LLM integration can optionally provide native implementations for invoke, s
| Fireworks | ✅ | ✅ | ✅ |
| GooglePaLM | ✅ | ❌ | ✅ |
| HuggingFaceInference | ✅ | ❌ | ✅ |
-| LlamaCpp | ✅ | ❌ | ✅ |
+| LlamaCpp | ✅ | ✅ | ✅ |
| Ollama | ✅ | ✅ | ✅ |
| OpenAIChat | ✅ | ✅ | ✅ |
| PromptLayerOpenAIChat | ✅ | ✅ | ✅ |
diff --git a/docs/core_docs/docs/integrations/llms/llama_cpp.mdx b/docs/core_docs/docs/integrations/llms/llama_cpp.mdx
index 0407e643e212..a23ec7acd9e6 100644
--- a/docs/core_docs/docs/integrations/llms/llama_cpp.mdx
+++ b/docs/core_docs/docs/integrations/llms/llama_cpp.mdx
@@ -105,3 +105,9 @@ import CodeBlock from "@theme/CodeBlock";
import LlamaCppExample from "@examples/models/llm/llama_cpp.ts";
{LlamaCppExample}
+
+## Streaming
+
+import LlamaCppStreamExample from "@examples/models/llm/llama_cpp_stream.ts";
+
+{LlamaCppStreamExample}
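A similar sketch for the LLM class, where `stream()` yields plain string tokens that can be joined directly (same placeholder model path):

```typescript
import { LlamaCpp } from "langchain/llms/llama_cpp";

// Same placeholder path as the examples in this PR.
const llamaPath = "/Replace/with/path/to/your/model/gguf-llama2-q4_0.bin";

const model = new LlamaCpp({ modelPath: llamaPath, temperature: 0.7 });

// The LLM stream yields strings, so accumulation is a plain concatenation.
let completion = "";
for await (const chunk of model.stream("Where do Llamas come from?")) {
  completion += chunk;
}
console.log(completion);
```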
diff --git a/examples/src/models/chat/integration_llama_cpp_stream.ts b/examples/src/models/chat/integration_llama_cpp_stream.ts
new file mode 100644
index 000000000000..b43ae2320d42
--- /dev/null
+++ b/examples/src/models/chat/integration_llama_cpp_stream.ts
@@ -0,0 +1,29 @@
+import { ChatLlamaCpp } from "langchain/chat_models/llama_cpp";
+
+const llamaPath = "/Replace/with/path/to/your/model/gguf-llama2-q4_0.bin";
+
+const model = new ChatLlamaCpp({ modelPath: llamaPath, temperature: 0.7 });
+
+const stream = await model.stream([
+ ["human", "Tell me a short story about a happy Llama."],
+]);
+
+for await (const chunk of stream) {
+ console.log(chunk.content);
+}
+
+/*
+
+ Once
+ upon
+ a
+ time
+ ,
+ in
+ a
+ green
+ and
+ sunny
+ field
+ ...
+*/
diff --git a/examples/src/models/llm/llama_cpp.ts b/examples/src/models/llm/llama_cpp.ts
index 63dff29efbc0..d5428c544837 100644
--- a/examples/src/models/llm/llama_cpp.ts
+++ b/examples/src/models/llm/llama_cpp.ts
@@ -6,5 +6,5 @@ const question = "Where do Llamas come from?";
const model = new LlamaCpp({ modelPath: llamaPath });
console.log(`You: ${question}`);
-const response = await model.call(question);
+const response = await model.invoke(question);
console.log(`AI : ${response}`);
diff --git a/examples/src/models/llm/llama_cpp_stream.ts b/examples/src/models/llm/llama_cpp_stream.ts
new file mode 100644
index 000000000000..a7ca0e35dd6d
--- /dev/null
+++ b/examples/src/models/llm/llama_cpp_stream.ts
@@ -0,0 +1,30 @@
+import { LlamaCpp } from "langchain/llms/llama_cpp";
+
+const llamaPath = "/Replace/with/path/to/your/model/gguf-llama2-q4_0.bin";
+
+const model = new LlamaCpp({ modelPath: llamaPath, temperature: 0.7 });
+
+const prompt = "Tell me a short story about a happy Llama.";
+
+const stream = await model.stream(prompt);
+
+for await (const chunk of stream) {
+ console.log(chunk);
+}
+
+/*
+
+
+ Once
+ upon
+ a
+ time
+ ,
+ in
+ the
+ rolling
+ hills
+ of
+ Peru
+ ...
+ */
diff --git a/langchain/src/chat_models/llama_cpp.ts b/langchain/src/chat_models/llama_cpp.ts
index c68712ddf2f5..76ad790708c8 100644
--- a/langchain/src/chat_models/llama_cpp.ts
+++ b/langchain/src/chat_models/llama_cpp.ts
@@ -11,7 +11,12 @@ import {
createLlamaContext,
} from "../util/llama_cpp.js";
import { BaseLanguageModelCallOptions } from "../base_language/index.js";
-import type { BaseMessage } from "../schema/index.js";
+import { CallbackManagerForLLMRun } from "../callbacks/manager.js";
+import {
+ BaseMessage,
+ ChatGenerationChunk,
+ AIMessageChunk,
+} from "../schema/index.js";
/**
* Note that the modelPath is the only required parameter. For testing you
@@ -145,6 +150,42 @@ export class ChatLlamaCpp extends SimpleChatModel {
}
}
+ async *_streamResponseChunks(
+ input: BaseMessage[],
+ _options: this["ParsedCallOptions"],
+ runManager?: CallbackManagerForLLMRun
+  ): AsyncGenerator<ChatGenerationChunk> {
+ if (input.length !== 1) {
+ throw new Error("Only one human message should be provided.");
+ } else {
+ const promptOptions = {
+ temperature: this?.temperature,
+ topK: this?.topK,
+ topP: this?.topP,
+ };
+
+ const stream = await this.caller.call(async () =>
+ this._context.evaluate(
+ this._context.encode(`${input[0].content}`),
+ promptOptions
+ )
+ );
+
+      for await (const chunk of stream) {
+        const token = this._context.decode([chunk]);
+        yield new ChatGenerationChunk({
+          text: token,
+          message: new AIMessageChunk({ content: token }),
+          generationInfo: {},
+        });
+        await runManager?.handleLLMNewToken(token ?? "");
+      }
+ }
+ }
+
  // This constructs a new session if needed, adding in any system messages or previous chats
protected _buildSession(messages: BaseMessage[]): string {
let prompt = "";
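Because the generator above reports every decoded token through `runManager?.handleLLMNewToken(...)`, token-level callbacks also fire while the stream is consumed. A hedged sketch, assuming the standard `callbacks` call option and an inline handler object:

```typescript
import { ChatLlamaCpp } from "langchain/chat_models/llama_cpp";

const model = new ChatLlamaCpp({
  // Placeholder path, as in the examples in this PR.
  modelPath: "/Replace/with/path/to/your/model/gguf-llama2-q4_0.bin",
});

// handleLLMNewToken is invoked once per decoded token as the stream is drained.
const stream = await model.stream(
  [["human", "Tell me a short story about a happy Llama."]],
  {
    callbacks: [
      {
        handleLLMNewToken(token: string) {
          process.stdout.write(token);
        },
      },
    ],
  }
);

for await (const _chunk of stream) {
  // No-op: output is handled by the callback above.
}
```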
diff --git a/langchain/src/chat_models/tests/chatllama_cpp.int.test.ts b/langchain/src/chat_models/tests/chatllama_cpp.int.test.ts
index 8bb94ed91671..3f33e5a4d113 100644
--- a/langchain/src/chat_models/tests/chatllama_cpp.int.test.ts
+++ b/langchain/src/chat_models/tests/chatllama_cpp.int.test.ts
@@ -81,3 +81,19 @@ test.skip("Test chain with memory", async () => {
const response3 = await chain.call({ input: "What is your name?" });
console.log({ response3 });
});
+
+test.skip("test streaming call", async () => {
+ const llamaCpp = new ChatLlamaCpp({ modelPath: llamaPath, temperature: 0.7 });
+
+ const stream = await llamaCpp.stream([
+ ["human", "Tell me a short story about a happy Llama."],
+ ]);
+
+ const chunks = [];
+ for await (const chunk of stream) {
+ chunks.push(chunk.content);
+    process.stdout.write(chunk.content);
+ }
+
+ expect(chunks.length).toBeGreaterThan(1);
+});
diff --git a/langchain/src/llms/llama_cpp.ts b/langchain/src/llms/llama_cpp.ts
index 44ae0009ff61..5081bde04008 100644
--- a/langchain/src/llms/llama_cpp.ts
+++ b/langchain/src/llms/llama_cpp.ts
@@ -6,6 +6,8 @@ import {
createLlamaSession,
} from "../util/llama_cpp.js";
import { LLM, BaseLLMCallOptions, BaseLLMParams } from "./base.js";
+import { CallbackManagerForLLMRun } from "../callbacks/manager.js";
+import { GenerationChunk } from "../schema/index.js";
/**
* Note that the modelPath is the only required parameter. For testing you
@@ -70,8 +72,7 @@ export class LlamaCpp extends LLM {
/** @ignore */
async _call(
prompt: string,
- // @ts-expect-error - TS6133: 'options' is declared but its value is never read.
- options?: this["ParsedCallOptions"]
+ _options?: this["ParsedCallOptions"]
  ): Promise<string> {
try {
const promptOptions = {
@@ -87,4 +88,28 @@ export class LlamaCpp extends LLM {
throw new Error("Error getting prompt completion.");
}
}
+
+ async *_streamResponseChunks(
+ prompt: string,
+ _options: this["ParsedCallOptions"],
+ runManager?: CallbackManagerForLLMRun
+  ): AsyncGenerator<GenerationChunk> {
+ const promptOptions = {
+ temperature: this?.temperature,
+ topK: this?.topK,
+ topP: this?.topP,
+ };
+
+ const stream = await this.caller.call(async () =>
+ this._context.evaluate(this._context.encode(prompt), promptOptions)
+ );
+
+    for await (const chunk of stream) {
+      const token = this._context.decode([chunk]);
+      yield new GenerationChunk({
+        text: token,
+        generationInfo: {},
+      });
+      await runManager?.handleLLMNewToken(token ?? "");
+    }
+ }
}
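The `GenerationChunk`s yielded above are what the base `LLM` class merges into the final generation. A hedged sketch of that merge done by hand, assuming `GenerationChunk.concat` joins chunk text as elsewhere in `schema/index.js`:

```typescript
import { GenerationChunk } from "langchain/schema";

// Hypothetical pieces, standing in for what _streamResponseChunks yields.
const pieces = ["Once", " upon", " a", " time"].map(
  (text) => new GenerationChunk({ text, generationInfo: {} })
);

// Folding the chunks with concat() reconstructs the full completion text.
const merged = pieces.reduce((acc, chunk) => acc.concat(chunk));
console.log(merged.text); // "Once upon a time"
```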
diff --git a/langchain/src/llms/tests/llama_cpp.int.test.ts b/langchain/src/llms/tests/llama_cpp.int.test.ts
index 85dd917ae4e8..d0fe6cc4268e 100644
--- a/langchain/src/llms/tests/llama_cpp.int.test.ts
+++ b/langchain/src/llms/tests/llama_cpp.int.test.ts
@@ -29,3 +29,19 @@ test.skip("Test Llama_CPP", async () => {
);
}
}, 100000);
+
+test.skip("Test Llama_CPP", async () => {
+ const model = new LlamaCpp({ modelPath: llamaPath, temperature: 0.7 });
+
+ const stream = await model.stream(
+ "Tell me a short story about a happy Llama."
+ );
+
+ const chunks = [];
+ for await (const chunk of stream) {
+ chunks.push(chunk);
+    process.stdout.write(chunk);
+ }
+
+ expect(chunks.length).toBeGreaterThan(1);
+});