integration[minor]: Llama Cpp streaming (#3394)
* Got streaming working in LLM & Chat

* Linted streaming and added docs.

* Small fixes

* Update llama_cpp.mdx

---------

Co-authored-by: jacoblee93 <[email protected]>
nigel-daniels and jacoblee93 authored Nov 28, 2023
1 parent ed51ace commit aef5627
Showing 11 changed files with 178 additions and 7 deletions.
2 changes: 1 addition & 1 deletion docs/core_docs/docs/integrations/chat/index.mdx
@@ -25,7 +25,7 @@ Each ChatModel integration can optionally provide native implementations to trul
| ChatCloudflareWorkersAI | | | |
| ChatFireworks | | | |
| ChatGooglePaLM | | | |
| ChatLlamaCpp | | | |
| ChatLlamaCpp | | | |
| ChatMinimax | | | |
| ChatOllama | | | |
| ChatOpenAI | | | |
10 changes: 9 additions & 1 deletion docs/core_docs/docs/integrations/chat/llama_cpp.mdx
@@ -47,8 +47,16 @@ import SystemExample from "@examples/models/chat/integration_llama_cpp_system.ts

### Chains

Finally we can also use this module with chains, note that using more complex chains will require suitably powerful version of `llama2` such as the 70B version.
This module can also be used with chains. Note that more complex chains require a suitably powerful version of `llama2`, such as the 70B version.

import ChainExample from "@examples/models/chat/integration_llama_cpp_chain.ts";

<CodeBlock language="typescript">{ChainExample}</CodeBlock>
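
For reference, a chain with `ChatLlamaCpp` might look roughly like the sketch below; the imported `ChainExample` may differ in its prompt and details, and the model path is a placeholder:

```typescript
import { ChatLlamaCpp } from "langchain/chat_models/llama_cpp";
import { LLMChain } from "langchain/chains";
import { PromptTemplate } from "langchain/prompts";

// Placeholder path to a local GGUF model.
const llamaPath = "/Replace/with/path/to/your/model/gguf-llama2-q4_0.bin";

const model = new ChatLlamaCpp({ modelPath: llamaPath, temperature: 0.5 });

// A simple prompt -> model chain.
const prompt = PromptTemplate.fromTemplate(
  "What is a good name for a company that makes {product}?"
);
const chain = new LLMChain({ llm: model, prompt });

const response = await chain.call({ product: "colorful socks" });
console.log(response);
```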

### Streaming

We can also stream with Llama CPP:

import StreamExample from "@examples/models/chat/integration_llama_cpp_stream.ts";

<CodeBlock language="typescript">{StreamExample}</CodeBlock>
2 changes: 1 addition & 1 deletion docs/core_docs/docs/integrations/llms/index.mdx
@@ -27,7 +27,7 @@ Each LLM integration can optionally provide native implementations for invoke, s
| Fireworks | | | |
| GooglePaLM | | | |
| HuggingFaceInference | | | |
| LlamaCpp | | | |
| LlamaCpp | | | |
| Ollama | | | |
| OpenAIChat | | | |
| PromptLayerOpenAIChat | | | |
6 changes: 6 additions & 0 deletions docs/core_docs/docs/integrations/llms/llama_cpp.mdx
@@ -105,3 +105,9 @@ import CodeBlock from "@theme/CodeBlock";
import LlamaCppExample from "@examples/models/llm/llama_cpp.ts";

<CodeBlock language="typescript">{LlamaCppExample}</CodeBlock>

## Streaming

import LlamaCppStreamExample from "@examples/models/llm/llama_cpp_stream.ts";

<CodeBlock language="typescript">{LlamaCppStreamExample}</CodeBlock>
29 changes: 29 additions & 0 deletions examples/src/models/chat/integration_llama_cpp_stream.ts
@@ -0,0 +1,29 @@
import { ChatLlamaCpp } from "langchain/chat_models/llama_cpp";

const llamaPath = "/Replace/with/path/to/your/model/gguf-llama2-q4_0.bin";

const model = new ChatLlamaCpp({ modelPath: llamaPath, temperature: 0.7 });

const stream = await model.stream([
  ["human", "Tell me a short story about a happy Llama."],
]);

for await (const chunk of stream) {
  console.log(chunk.content);
}

/*
Once
upon
a
time
,
in
a
green
and
sunny
field
...
*/
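
Since each iteration yields a message chunk, the pieces can also be merged back into a single message; a minimal sketch, assuming the `model` defined in the example above:

```typescript
import type { BaseMessageChunk } from "langchain/schema";

const messageStream = await model.stream([
  ["human", "Tell me a short story about a happy Llama."],
]);

// Merge the streamed chunks back into one message with concat().
let finalChunk: BaseMessageChunk | undefined;
for await (const chunk of messageStream) {
  finalChunk = finalChunk === undefined ? chunk : finalChunk.concat(chunk);
}
console.log(finalChunk?.content);
```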
2 changes: 1 addition & 1 deletion examples/src/models/llm/llama_cpp.ts
@@ -6,5 +6,5 @@ const question = "Where do Llamas come from?";
const model = new LlamaCpp({ modelPath: llamaPath });

console.log(`You: ${question}`);
const response = await model.call(question);
const response = await model.invoke(question);
console.log(`AI : ${response}`);
30 changes: 30 additions & 0 deletions examples/src/models/llm/llama_cpp_stream.ts
@@ -0,0 +1,30 @@
import { LlamaCpp } from "langchain/llms/llama_cpp";

const llamaPath = "/Replace/with/path/to/your/model/gguf-llama2-q4_0.bin";

const model = new LlamaCpp({ modelPath: llamaPath, temperature: 0.7 });

const prompt = "Tell me a short story about a happy Llama.";

const stream = await model.stream(prompt);

for await (const chunk of stream) {
  console.log(chunk);
}

/*
Once
upon
a
time
,
in
the
rolling
hills
of
Peru
...
*/
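
Because the plain LLM streams string chunks rather than message chunks, rebuilding the full completion is a simple concatenation; a minimal sketch, reusing the `model` defined above:

```typescript
// Accumulate the streamed string chunks into the full completion.
let completion = "";
for await (const chunk of await model.stream(
  "Tell me a short story about a happy Llama."
)) {
  completion += chunk;
}
console.log(completion);
```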
43 changes: 42 additions & 1 deletion langchain/src/chat_models/llama_cpp.ts
@@ -11,7 +11,12 @@ import {
createLlamaContext,
} from "../util/llama_cpp.js";
import { BaseLanguageModelCallOptions } from "../base_language/index.js";
import type { BaseMessage } from "../schema/index.js";
import { CallbackManagerForLLMRun } from "../callbacks/manager.js";
import {
  BaseMessage,
  ChatGenerationChunk,
  AIMessageChunk,
} from "../schema/index.js";

/**
* Note that the modelPath is the only required parameter. For testing you
@@ -145,6 +150,42 @@ export class ChatLlamaCpp extends SimpleChatModel<LlamaCppCallOptions> {
}
}

async *_streamResponseChunks(
  input: BaseMessage[],
  _options: this["ParsedCallOptions"],
  runManager?: CallbackManagerForLLMRun
): AsyncGenerator<ChatGenerationChunk> {
  if (input.length !== 1) {
    throw new Error("Only one human message should be provided.");
  } else {
    const promptOptions = {
      temperature: this?.temperature,
      topK: this?.topK,
      topP: this?.topP,
    };

    // Evaluate the encoded prompt; the context yields token ids one at a time.
    const stream = await this.caller.call(async () =>
      this._context.evaluate(
        this._context.encode(`${input[0].content}`),
        promptOptions
      )
    );

    for await (const chunk of stream) {
      // Decode each token id back to text and surface it as a chat generation chunk.
      yield new ChatGenerationChunk({
        text: this._context.decode([chunk]),
        message: new AIMessageChunk({
          content: this._context.decode([chunk]),
        }),
        generationInfo: {},
      });
      await runManager?.handleLLMNewToken(
        this._context.decode([chunk]) ?? ""
      );
    }
  }
}

// This constructs a new session if we need to add in any system messages or previous chats
protected _buildSession(messages: BaseMessage[]): string {
let prompt = "";
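Because the generator above also forwards each decoded token to `handleLLMNewToken`, streamed tokens can be observed through a callback handler as well as through the async iterator. A rough usage sketch (the handler, prompt, and model path below are illustrative and not part of this commit):

```typescript
import { ChatLlamaCpp } from "langchain/chat_models/llama_cpp";

const model = new ChatLlamaCpp({
  // Placeholder path to a local GGUF model.
  modelPath: "/Replace/with/path/to/your/model/gguf-llama2-q4_0.bin",
  temperature: 0.7,
});

const stream = await model.stream(
  [["human", "Tell me a short story about a happy Llama."]],
  {
    callbacks: [
      {
        handleLLMNewToken(token: string) {
          // Fired once per decoded token as the generator yields chunks.
          process.stdout.write(token);
        },
      },
    ],
  }
);

// Consuming the iterator drives generation; the callback prints each token.
for await (const _chunk of stream) {
  // Chunks are also available here as message chunks.
}
```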
16 changes: 16 additions & 0 deletions langchain/src/chat_models/tests/chatllama_cpp.int.test.ts
@@ -81,3 +81,19 @@ test.skip("Test chain with memory", async () => {
const response3 = await chain.call({ input: "What is your name?" });
console.log({ response3 });
});

test.skip("test streaming call", async () => {
  const llamaCpp = new ChatLlamaCpp({ modelPath: llamaPath, temperature: 0.7 });

  const stream = await llamaCpp.stream([
    ["human", "Tell me a short story about a happy Llama."],
  ]);

  const chunks = [];
  for await (const chunk of stream) {
    chunks.push(chunk.content);
    process.stdout.write(chunks.join(""));
  }

  expect(chunks.length).toBeGreaterThan(1);
});
29 changes: 27 additions & 2 deletions langchain/src/llms/llama_cpp.ts
@@ -6,6 +6,8 @@ import {
createLlamaSession,
} from "../util/llama_cpp.js";
import { LLM, BaseLLMCallOptions, BaseLLMParams } from "./base.js";
import { CallbackManagerForLLMRun } from "../callbacks/manager.js";
import { GenerationChunk } from "../schema/index.js";

/**
* Note that the modelPath is the only required parameter. For testing you
@@ -70,8 +72,7 @@ export class LlamaCpp extends LLM<LlamaCppCallOptions> {
/** @ignore */
async _call(
prompt: string,
// @ts-expect-error - TS6133: 'options' is declared but its value is never read.
options?: this["ParsedCallOptions"]
_options?: this["ParsedCallOptions"]
): Promise<string> {
try {
const promptOptions = {
@@ -87,4 +88,28 @@ export class LlamaCpp extends LLM<LlamaCppCallOptions> {
throw new Error("Error getting prompt completion.");
}
}

async *_streamResponseChunks(
  prompt: string,
  _options: this["ParsedCallOptions"],
  runManager?: CallbackManagerForLLMRun
): AsyncGenerator<GenerationChunk> {
  const promptOptions = {
    temperature: this?.temperature,
    topK: this?.topK,
    topP: this?.topP,
  };

  // Evaluate the encoded prompt; the context yields token ids one at a time.
  const stream = await this.caller.call(async () =>
    this._context.evaluate(this._context.encode(prompt), promptOptions)
  );

  for await (const chunk of stream) {
    // Decode each token id back to text and emit it as a generation chunk.
    yield new GenerationChunk({
      text: this._context.decode([chunk]),
      generationInfo: {},
    });
    await runManager?.handleLLMNewToken(this._context.decode([chunk]) ?? "");
  }
}
}
16 changes: 16 additions & 0 deletions langchain/src/llms/tests/llama_cpp.int.test.ts
@@ -29,3 +29,19 @@ test.skip("Test Llama_CPP", async () => {
);
}
}, 100000);

test.skip("Test Llama_CPP", async () => {
  const model = new LlamaCpp({ modelPath: llamaPath, temperature: 0.7 });

  const stream = await model.stream(
    "Tell me a short story about a happy Llama."
  );

  const chunks = [];
  for await (const chunk of stream) {
    chunks.push(chunk);
    process.stdout.write(chunks.join(""));
  }

  expect(chunks.length).toBeGreaterThan(1);
});

1 comment on commit aef5627

@vercel vercel bot commented on aef5627 Nov 28, 2023

Successfully deployed to the following URLs:

langchainjs-docs – ./docs/core_docs/

langchainjs-docs-ruddy.vercel.app
langchainjs-docs-langchain.vercel.app
langchainjs-docs-git-main-langchain.vercel.app
js.langchain.com
