feat: threads count setting on a model #33

Merged (2 commits) on Sep 2, 2023
1 change: 1 addition & 0 deletions README.md
@@ -287,6 +287,7 @@ Optional:
-c, --contextSize Context size to use for the model [number] [default: 4096]
-g, --grammar Restrict the model response to a specific grammar, like JSON for example
[string] [choices: "text", "json", "list", "arithmetic", "japanese", "chess"] [default: "text"]
--threads Number of threads to use for the evaluation of tokens [number] [default: 6]
-t, --temperature Temperature is a hyperparameter that controls the randomness of the generat
ed text. It affects the probability distribution of the model's output toke
ns. A higher temperature (e.g., 1.5) makes the output more random and creat
8 changes: 7 additions & 1 deletion llama/addon.cpp
@@ -13,6 +13,7 @@ class LLAMAModel : public Napi::ObjectWrap<LLAMAModel> {
llama_context_params params;
llama_model* model;
float temperature;
int threads;
int32_t top_k;
float top_p;

@@ -21,6 +22,7 @@ class LLAMAModel : public Napi::ObjectWrap<LLAMAModel> {
params.seed = -1;
params.n_ctx = 4096;
temperature = 0.0f;
threads = 6;
top_k = 40;
top_p = 0.95f;

@@ -74,6 +76,10 @@ class LLAMAModel : public Napi::ObjectWrap<LLAMAModel> {
params.embedding = options.Get("embedding").As<Napi::Boolean>().Value();
}

if (options.Has("threads")) {
threads = options.Get("threads").As<Napi::Number>().Int32Value();
}

if (options.Has("temperature")) {
temperature = options.Get("temperature").As<Napi::Number>().FloatValue();
}
@@ -283,7 +289,7 @@ class LLAMAContextEvalWorker : Napi::AsyncWorker, Napi::Promise::Deferred {
protected:
void Execute() {
// Perform the evaluation using llama_eval.
int r = llama_eval(ctx->ctx, tokens.data(), int(tokens.size()), llama_get_kv_cache_token_count(ctx->ctx), 6);
int r = llama_eval(ctx->ctx, tokens.data(), int(tokens.size()), llama_get_kv_cache_token_count(ctx->ctx), (ctx->model)->threads);
if (r != 0) {
SetError("Eval has failed");
return;
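In TypeScript terms, the constructor logic added above amounts to reading an optional `threads` field and falling back to 6, with the resolved value later handed to `llama_eval` as its thread-count argument in place of the previous hard-coded 6. The snippet below is an illustrative re-expression of that C++ logic, not code from this PR:

```ts
// Illustrative TypeScript re-expression of the addon.cpp constructor defaults
// shown above; this is a sketch, not code that exists in the repository.
type AddonModelOptions = {
    threads?: number;
    temperature?: number;
    topK?: number;
    topP?: number;
};

function resolveAddonDefaults(options: AddonModelOptions) {
    return {
        threads: options.threads ?? 6,          // addon.cpp: threads = 6 unless options.Has("threads")
        temperature: options.temperature ?? 0,  // temperature = 0.0f
        topK: options.topK ?? 40,               // top_k = 40
        topP: options.topP ?? 0.95              // top_p = 0.95f
    };
}

// The resolved thread count is what the eval worker now passes to llama_eval,
// replacing the previous hard-coded 6.
console.log(resolveAddonDefaults({threads: 8}).threads); // 8
console.log(resolveAddonDefaults({}).threads);           // 6
```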
14 changes: 11 additions & 3 deletions src/cli/commands/ChatCommand.ts
@@ -18,6 +18,7 @@ type ChatCommand = {
wrapper: "auto" | "general" | "llamaChat" | "chatML",
contextSize: number,
grammar: "text" | Parameters<typeof LlamaGrammar.getFor>[0],
threads: number,
temperature: number,
topK: number,
topP: number,
@@ -76,6 +77,12 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
description: "Restrict the model response to a specific grammar, like JSON for example",
group: "Optional:"
})
.option("threads", {
type: "number",
default: 6,
description: "Number of threads to use for the evaluation of tokens",
group: "Optional:"
})
.option("temperature", {
alias: "t",
type: "number",
@@ -107,10 +114,10 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
},
async handler({
model, systemInfo, systemPrompt, wrapper, contextSize, grammar,
temperature, topK, topP, maxTokens
threads, temperature, topK, topP, maxTokens
}) {
try {
await RunChat({model, systemInfo, systemPrompt, wrapper, contextSize, grammar, temperature, topK, topP, maxTokens});
await RunChat({model, systemInfo, systemPrompt, wrapper, contextSize, grammar, threads, temperature, topK, topP, maxTokens});
} catch (err) {
console.error(err);
process.exit(1);
@@ -120,7 +127,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {


async function RunChat({
model: modelArg, systemInfo, systemPrompt, wrapper, contextSize, grammar: grammarArg, temperature, topK, topP, maxTokens
model: modelArg, systemInfo, systemPrompt, wrapper, contextSize, grammar: grammarArg, threads, temperature, topK, topP, maxTokens
}: ChatCommand) {
const {LlamaChatSession} = await import("../../llamaEvaluator/LlamaChatSession.js");
const {LlamaModel} = await import("../../llamaEvaluator/LlamaModel.js");
@@ -130,6 +137,7 @@ async function RunChat({
const model = new LlamaModel({
modelPath: modelArg,
contextSize,
threads,
temperature,
topK,
topP
7 changes: 6 additions & 1 deletion src/llamaEvaluator/LlamaModel.ts
@@ -21,6 +21,9 @@ export type LlamaModelOptions = {
/** if true, reduce VRAM usage at the cost of performance */
lowVram?: boolean,

/** number of threads to use to evaluate tokens */
threads?: number,

/**
* Temperature is a hyperparameter that controls the randomness of the generated text.
* It affects the probability distribution of the model's output tokens.
@@ -85,6 +88,7 @@ export class LlamaModel {
* @param {number} [options.batchSize] - prompt processing batch size
* @param {number} [options.gpuLayers] - number of layers to store in VRAM
* @param {boolean} [options.lowVram] - if true, reduce VRAM usage at the cost of performance
* @param {number} [options.threads] - number of threads to use to evaluate tokens
* @param {number} [options.temperature] - Temperature is a hyperparameter that controls the randomness of the generated text.
* It affects the probability distribution of the model's output tokens.
* A higher temperature (e.g., 1.5) makes the output more random and creative,
@@ -114,14 +118,15 @@
*/
public constructor({
modelPath, seed = null, contextSize = 1024 * 4, batchSize, gpuLayers,
lowVram, temperature = 0, topK = 40, topP = 0.95, f16Kv, logitsAll, vocabOnly, useMmap, useMlock, embedding
lowVram, threads = 6, temperature = 0, topK = 40, topP = 0.95, f16Kv, logitsAll, vocabOnly, useMmap, useMlock, embedding
}: LlamaModelOptions) {
this._model = new LLAMAModel(modelPath, removeNullFields({
seed: seed != null ? Math.max(-1, seed) : undefined,
contextSize,
batchSize,
gpuLayers,
lowVram,
threads,
temperature,
topK,
topP,
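With `threads` exposed on `LlamaModelOptions`, library users can set the evaluation thread count directly when constructing a model. A minimal usage sketch follows; the model path is a placeholder, and the `LlamaContext`/`LlamaChatSession` wiring is assumed from the imports used in `ChatCommand.ts` rather than shown in this diff:

```ts
// Minimal usage sketch. The import paths mirror the source layout in this PR;
// LlamaContext and the exact session wiring are assumptions, not part of the diff.
import {LlamaModel} from "./src/llamaEvaluator/LlamaModel.js";
import {LlamaContext} from "./src/llamaEvaluator/LlamaContext.js";
import {LlamaChatSession} from "./src/llamaEvaluator/LlamaChatSession.js";

const model = new LlamaModel({
    modelPath: "./models/model.gguf", // placeholder path
    contextSize: 4096,
    threads: 8                        // new option; the addon falls back to 6 when omitted
});

const context = new LlamaContext({model});
const session = new LlamaChatSession({context});

console.log(await session.prompt("Hi there"));
```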
1 change: 1 addition & 0 deletions src/utils/getBin.ts
@@ -111,6 +111,7 @@ export type LLAMAModel = {
useMmap?: boolean,
useMlock?: boolean,
embedding?: boolean,
threads?: number,
temperature?: number,
topK?: number,
topP?: number
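A small design note: after this change the default of 6 threads appears in three places (the CLI option, the `LlamaModel` constructor, and `addon.cpp`), independent of how many cores the machine has. A caller who prefers to match the hardware can compute the value themselves; the helper below is a hypothetical illustration, not something this PR adds:

```ts
import os from "node:os";

// Hypothetical helper (not part of this PR): derive the thread count from the
// machine instead of relying on the fixed default of 6, keeping one core free.
export function defaultThreadCount(): number {
    return Math.max(1, os.cpus().length - 1);
}

// e.g. new LlamaModel({modelPath, threads: defaultThreadCount()})
```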