integration[minor]: Llama Cpp streaming (#3394)
* Got streaming working in LLM & Chat

* Linted streaming and added docs.

* Small fixes

* Update llama_cpp.mdx

---------

Co-authored-by: jacoblee93 <[email protected]>
nigel-daniels and jacoblee93 authored Nov 28, 2023
1 parent ed51ace commit aef5627
Showing 11 changed files with 178 additions and 7 deletions.
2 changes: 1 addition & 1 deletion docs/core_docs/docs/integrations/chat/index.mdx
@@ -25,7 +25,7 @@ Each ChatModel integration can optionally provide native implementations to trul
| ChatCloudflareWorkersAI | | | |
| ChatFireworks | | | |
| ChatGooglePaLM | | | |
| ChatLlamaCpp | | | |
| ChatLlamaCpp | | | |
| ChatMinimax | | | |
| ChatOllama | | | |
| ChatOpenAI | | | |
10 changes: 9 additions & 1 deletion docs/core_docs/docs/integrations/chat/llama_cpp.mdx
@@ -47,8 +47,16 @@ import SystemExample from "@examples/models/chat/integration_llama_cpp_system.ts

### Chains

Finally we can also use this module with chains, note that using more complex chains will require suitably powerful version of `llama2` such as the 70B version.
This module can also be used with chains. Note that more complex chains require a suitably powerful version of `llama2`, such as the 70B version.

import ChainExample from "@examples/models/chat/integration_llama_cpp_chain.ts";

<CodeBlock language="typescript">{ChainExample}</CodeBlock>
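
For reference, a chain with `ChatLlamaCpp` might look roughly like the sketch below; the imported `ChainExample` may differ in its prompt and details, and the model path is a placeholder:

```typescript
import { ChatLlamaCpp } from "langchain/chat_models/llama_cpp";
import { LLMChain } from "langchain/chains";
import { PromptTemplate } from "langchain/prompts";

// Placeholder path to a local GGUF model.
const llamaPath = "/Replace/with/path/to/your/model/gguf-llama2-q4_0.bin";

const model = new ChatLlamaCpp({ modelPath: llamaPath, temperature: 0.5 });

// A simple prompt -> model chain.
const prompt = PromptTemplate.fromTemplate(
  "What is a good name for a company that makes {product}?"
);
const chain = new LLMChain({ llm: model, prompt });

const response = await chain.call({ product: "colorful socks" });
console.log(response);
```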

### Streaming

We can also stream with Llama CPP:

import StreamExample from "@examples/models/chat/integration_llama_cpp_stream.ts";

<CodeBlock language="typescript">{StreamExample}</CodeBlock>
2 changes: 1 addition & 1 deletion docs/core_docs/docs/integrations/llms/index.mdx
@@ -27,7 +27,7 @@ Each LLM integration can optionally provide native implementations for invoke, s
| Fireworks | | | |
| GooglePaLM | | | |
| HuggingFaceInference | | | |
| LlamaCpp | | | |
| LlamaCpp | | | |
| Ollama | | | |
| OpenAIChat | | | |
| PromptLayerOpenAIChat | | | |
6 changes: 6 additions & 0 deletions docs/core_docs/docs/integrations/llms/llama_cpp.mdx
@@ -105,3 +105,9 @@ import CodeBlock from "@theme/CodeBlock";
import LlamaCppExample from "@examples/models/llm/llama_cpp.ts";

<CodeBlock language="typescript">{LlamaCppExample}</CodeBlock>

## Streaming

import LlamaCppStreamExample from "@examples/models/llm/llama_cpp_stream.ts";

<CodeBlock language="typescript">{LlamaCppStreamExample}</CodeBlock>
29 changes: 29 additions & 0 deletions examples/src/models/chat/integration_llama_cpp_stream.ts
@@ -0,0 +1,29 @@
import { ChatLlamaCpp } from "langchain/chat_models/llama_cpp";

const llamaPath = "/Replace/with/path/to/your/model/gguf-llama2-q4_0.bin";

const model = new ChatLlamaCpp({ modelPath: llamaPath, temperature: 0.7 });

const stream = await model.stream([
  ["human", "Tell me a short story about a happy Llama."],
]);

for await (const chunk of stream) {
  console.log(chunk.content);
}

/*
Once
upon
a
time
,
in
a
green
and
sunny
field
...
*/
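
Since each iteration yields a message chunk, the pieces can also be merged back into a single message; a minimal sketch, assuming the `model` defined in the example above:

```typescript
import type { BaseMessageChunk } from "langchain/schema";

const messageStream = await model.stream([
  ["human", "Tell me a short story about a happy Llama."],
]);

// Merge the streamed chunks back into one message with concat().
let finalChunk: BaseMessageChunk | undefined;
for await (const chunk of messageStream) {
  finalChunk = finalChunk === undefined ? chunk : finalChunk.concat(chunk);
}
console.log(finalChunk?.content);
```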
2 changes: 1 addition & 1 deletion examples/src/models/llm/llama_cpp.ts
@@ -6,5 +6,5 @@ const question = "Where do Llamas come from?";
const model = new LlamaCpp({ modelPath: llamaPath });

console.log(`You: ${question}`);
const response = await model.call(question);
const response = await model.invoke(question);
console.log(`AI : ${response}`);
30 changes: 30 additions & 0 deletions examples/src/models/llm/llama_cpp_stream.ts
@@ -0,0 +1,30 @@
import { LlamaCpp } from "langchain/llms/llama_cpp";

const llamaPath = "/Replace/with/path/to/your/model/gguf-llama2-q4_0.bin";

const model = new LlamaCpp({ modelPath: llamaPath, temperature: 0.7 });

const prompt = "Tell me a short story about a happy Llama.";

const stream = await model.stream(prompt);

for await (const chunk of stream) {
  console.log(chunk);
}

/*
Once
upon
a
time
,
in
the
rolling
hills
of
Peru
...
*/
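
Because the plain LLM streams string chunks rather than message chunks, rebuilding the full completion is a simple concatenation; a minimal sketch, reusing the `model` defined above:

```typescript
// Accumulate the streamed string chunks into the full completion.
let completion = "";
for await (const chunk of await model.stream(
  "Tell me a short story about a happy Llama."
)) {
  completion += chunk;
}
console.log(completion);
```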
43 changes: 42 additions & 1 deletion langchain/src/chat_models/llama_cpp.ts
@@ -11,7 +11,12 @@ import {
createLlamaContext,
} from "../util/llama_cpp.js";
import { BaseLanguageModelCallOptions } from "../base_language/index.js";
import type { BaseMessage } from "../schema/index.js";
import { CallbackManagerForLLMRun } from "../callbacks/manager.js";
import {
  BaseMessage,
  ChatGenerationChunk,
  AIMessageChunk,
} from "../schema/index.js";

/**
* Note that the modelPath is the only required parameter. For testing you
@@ -145,6 +150,42 @@ export class ChatLlamaCpp extends SimpleChatModel<LlamaCppCallOptions> {
}
}

async *_streamResponseChunks(
  input: BaseMessage[],
  _options: this["ParsedCallOptions"],
  runManager?: CallbackManagerForLLMRun
): AsyncGenerator<ChatGenerationChunk> {
  if (input.length !== 1) {
    throw new Error("Only one human message should be provided.");
  } else {
    const promptOptions = {
      temperature: this?.temperature,
      topK: this?.topK,
      topP: this?.topP,
    };

    // Evaluate the encoded prompt; the context yields token ids one at a time.
    const stream = await this.caller.call(async () =>
      this._context.evaluate(
        this._context.encode(`${input[0].content}`),
        promptOptions
      )
    );

    for await (const chunk of stream) {
      // Decode each token id back to text and surface it as a chat generation chunk.
      yield new ChatGenerationChunk({
        text: this._context.decode([chunk]),
        message: new AIMessageChunk({
          content: this._context.decode([chunk]),
        }),
        generationInfo: {},
      });
      await runManager?.handleLLMNewToken(
        this._context.decode([chunk]) ?? ""
      );
    }
  }
}

// This constructs a new session if we need to add in any system messages or previous chats
protected _buildSession(messages: BaseMessage[]): string {
let prompt = "";
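Because the generator above also forwards each decoded token to `handleLLMNewToken`, streamed tokens can be observed through a callback handler as well as through the async iterator. A rough usage sketch (the handler, prompt, and model path below are illustrative and not part of this commit):

```typescript
import { ChatLlamaCpp } from "langchain/chat_models/llama_cpp";

const model = new ChatLlamaCpp({
  // Placeholder path to a local GGUF model.
  modelPath: "/Replace/with/path/to/your/model/gguf-llama2-q4_0.bin",
  temperature: 0.7,
});

const stream = await model.stream(
  [["human", "Tell me a short story about a happy Llama."]],
  {
    callbacks: [
      {
        handleLLMNewToken(token: string) {
          // Fired once per decoded token as the generator yields chunks.
          process.stdout.write(token);
        },
      },
    ],
  }
);

// Consuming the iterator drives generation; the callback prints each token.
for await (const _chunk of stream) {
  // Chunks are also available here as message chunks.
}
```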
16 changes: 16 additions & 0 deletions langchain/src/chat_models/tests/chatllama_cpp.int.test.ts
@@ -81,3 +81,19 @@ test.skip("Test chain with memory", async () => {
const response3 = await chain.call({ input: "What is your name?" });
console.log({ response3 });
});

test.skip("test streaming call", async () => {
  const llamaCpp = new ChatLlamaCpp({ modelPath: llamaPath, temperature: 0.7 });

  const stream = await llamaCpp.stream([
    ["human", "Tell me a short story about a happy Llama."],
  ]);

  const chunks = [];
  for await (const chunk of stream) {
    chunks.push(chunk.content);
    process.stdout.write(chunks.join(""));
  }

  expect(chunks.length).toBeGreaterThan(1);
});
29 changes: 27 additions & 2 deletions langchain/src/llms/llama_cpp.ts
@@ -6,6 +6,8 @@ import {
createLlamaSession,
} from "../util/llama_cpp.js";
import { LLM, BaseLLMCallOptions, BaseLLMParams } from "./base.js";
import { CallbackManagerForLLMRun } from "../callbacks/manager.js";
import { GenerationChunk } from "../schema/index.js";

/**
* Note that the modelPath is the only required parameter. For testing you
@@ -70,8 +72,7 @@ export class LlamaCpp extends LLM<LlamaCppCallOptions> {
/** @ignore */
async _call(
prompt: string,
// @ts-expect-error - TS6133: 'options' is declared but its value is never read.
options?: this["ParsedCallOptions"]
_options?: this["ParsedCallOptions"]
): Promise<string> {
try {
const promptOptions = {
@@ -87,4 +88,28 @@ export class LlamaCpp extends LLM<LlamaCppCallOptions> {
throw new Error("Error getting prompt completion.");
}
}

async *_streamResponseChunks(
  prompt: string,
  _options: this["ParsedCallOptions"],
  runManager?: CallbackManagerForLLMRun
): AsyncGenerator<GenerationChunk> {
  const promptOptions = {
    temperature: this?.temperature,
    topK: this?.topK,
    topP: this?.topP,
  };

  // Evaluate the encoded prompt; the context yields token ids one at a time.
  const stream = await this.caller.call(async () =>
    this._context.evaluate(this._context.encode(prompt), promptOptions)
  );

  for await (const chunk of stream) {
    // Decode each token id back to text and emit it as a generation chunk.
    yield new GenerationChunk({
      text: this._context.decode([chunk]),
      generationInfo: {},
    });
    await runManager?.handleLLMNewToken(this._context.decode([chunk]) ?? "");
  }
}
}
16 changes: 16 additions & 0 deletions langchain/src/llms/tests/llama_cpp.int.test.ts
@@ -29,3 +29,19 @@ test.skip("Test Llama_CPP", async () => {
);
}
}, 100000);

test.skip("Test Llama_CPP", async () => {
  const model = new LlamaCpp({ modelPath: llamaPath, temperature: 0.7 });

  const stream = await model.stream(
    "Tell me a short story about a happy Llama."
  );

  const chunks = [];
  for await (const chunk of stream) {
    chunks.push(chunk);
    process.stdout.write(chunks.join(""));
  }

  expect(chunks.length).toBeGreaterThan(1);
});

1 comment on commit aef5627

@vercel vercel bot commented on aef5627 Nov 28, 2023

Successfully deployed to the following URLs:

langchainjs-docs – ./docs/core_docs/

langchainjs-docs-ruddy.vercel.app
langchainjs-docs-langchain.vercel.app
langchainjs-docs-git-main-langchain.vercel.app
js.langchain.com
