Synchronize with llama.cpp upstream
AMD GPU support on Windows is still in a broken state.
jart committed Apr 20, 2024
1 parent b52304e commit 850f43d
Showing 21 changed files with 2,370 additions and 2,310 deletions.
4 changes: 2 additions & 2 deletions llama.cpp/README.llamafile
@@ -9,8 +9,8 @@ LICENSE
ORIGIN

https://github.com/ggerganov/llama.cpp/pull/4406/
67fac4b95fcccfda8ab965e9ba4992a9ddf3a25f
2024-04-10
b8109bc0139f15a5b321909f47510b89dca47ffc
2024-04-21

LOCAL MODIFICATIONS

97 changes: 92 additions & 5 deletions llama.cpp/common.cpp
@@ -1,16 +1,15 @@
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-
// vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8 :vi

#include "common.h"
#include "json.h"
#include "json-schema-to-grammar.h"
#include "llama.h"
#include "ggml-cuda.h"
#include "ggml-metal.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cerrno>
#include <cstring>
#include <climits>
#include <ctime>
#include <fstream>
#include <iterator>
@@ -74,6 +73,8 @@
#define LLAMA_CURL_MAX_HEADER_LENGTH 256
#endif // LLAMA_USE_CURL

using json = nlohmann::ordered_json;

int32_t get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
@@ -110,6 +111,79 @@ int32_t get_num_physical_cores() {
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}

#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
#include <pthread.h>

static void cpuid(unsigned leaf, unsigned subleaf,
unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
__asm__("movq\t%%rbx,%%rsi\n\t"
"cpuid\n\t"
"xchgq\t%%rbx,%%rsi"
: "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
: "0"(leaf), "2"(subleaf));
}

static int pin_cpu(int cpu) {
cpu_set_t mask;
CPU_ZERO(&mask);
CPU_SET(cpu, &mask);
return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
}

static bool is_hybrid_cpu(void) {
unsigned eax, ebx, ecx, edx;
cpuid(7, 0, &eax, &ebx, &ecx, &edx);
return !!(edx & (1u << 15));
}

static bool is_running_on_efficiency_core(void) {
unsigned eax, ebx, ecx, edx;
cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
int intel_atom = 0x20;
int core_type = (eax & 0xff000000u) >> 24;
return core_type == intel_atom;
}

static int count_math_cpus(int cpu_count) {
int result = 0;
for (int cpu = 0; cpu < cpu_count; ++cpu) {
if (pin_cpu(cpu)) {
return -1;
}
if (is_running_on_efficiency_core()) {
continue; // efficiency cores harm lockstep threading
}
++cpu; // hyperthreading isn't useful for linear algebra
++result;
}
return result;
}

#endif // __x86_64__ && __linux__

/**
* Returns number of CPUs on system that are useful for math.
*/
int get_math_cpu_count() {
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
if (cpu_count < 1) {
return get_num_physical_cores();
}
if (is_hybrid_cpu()) {
cpu_set_t affinity;
if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
int result = count_math_cpus(cpu_count);
pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
if (result > 0) {
return result;
}
}
}
#endif
return get_num_physical_cores();
}

void process_escapes(std::string & input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;
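The hunk above adds get_math_cpu_count(): on x86-64 Linux it pins itself to each logical CPU in turn, uses CPUID leaf 7 (the hybrid flag) and leaf 0x1A (core type) to skip Intel efficiency cores, skips hyperthread siblings, and restores the original affinity before returning; on other platforms it falls back to get_num_physical_cores(). A minimal standalone check, assuming it is compiled inside the llamafile tree with common.h on the include path (the test program itself is not part of this commit):

#include "common.h"

#include <cstdio>

int main() {
    // On hybrid x86-64 Linux machines this counts one thread per performance core;
    // on other platforms it simply returns get_num_physical_cores().
    printf("threads useful for math: %d\n", get_math_cpu_count());
    printf("physical cores:          %d\n", (int) get_num_physical_cores());
    return 0;
}

This is the value that now seeds gpt_params::n_threads in common.h further down.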
@@ -1167,6 +1241,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
);
return true;
}
if (arg == "-j" || arg == "--json-schema") {
if (++i >= argc) {
invalid_param = true;
return true;
}
sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
return true;
}
if (arg == "--override-kv") {
if (++i >= argc) {
invalid_param = true;
@@ -1374,6 +1456,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
printf(" --grammar-file FNAME file to read grammar from\n");
printf(" -j SCHEMA, --json-schema SCHEMA\n");
printf(" JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n");
printf(" For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n");
printf(" --cfg-negative-prompt PROMPT\n");
printf(" negative prompt to use for guidance. (default: empty)\n");
printf(" --cfg-negative-prompt-file FNAME\n");
@@ -1766,6 +1851,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
cparams.pooling_type = params.pooling_type;
cparams.defrag_thold = params.defrag_thold;
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.offload_kqv = !params.no_kv_offload;

cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
@@ -2213,7 +2300,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
}

{
if (params.warmup) {
LOG("warming up the model with an empty run\n");

std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
8 changes: 7 additions & 1 deletion llama.cpp/common.h
@@ -5,6 +5,7 @@

#pragma once

#include "llamafile/log.h"
#include "llama.h"

#include "sampling.h"
@@ -43,6 +44,7 @@ extern char const *LLAMA_BUILD_TARGET;

struct llama_control_vector_load_info;

int get_math_cpu_count();
int32_t get_num_physical_cores();

//
@@ -52,7 +54,7 @@ int32_t get_num_physical_cores();
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

int32_t n_threads = llamafile_get_math_cpu_count();
int32_t n_threads = get_math_cpu_count();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
@@ -84,6 +86,9 @@ struct gpt_params {
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold

ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;

ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
@@ -160,6 +165,7 @@ struct gpt_params {
bool infill = false; // use infill mode
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run

std::string cache_type_k = "f16"; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V
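The gpt_params additions above expose ggml's scheduler evaluation callback (cb_eval / cb_eval_user_data, forwarded to llama_context_params in common.cpp) and a warmup flag that lets callers skip the empty warm-up decode in llama_init_from_gpt_params(). A minimal sketch of wiring them up; the callback body and the tensor-name filter are illustrative assumptions, not part of this commit:

#include "common.h"

#include <cstdio>
#include <cstring>

// The scheduler calls this first with ask == true so the user can pick nodes to
// observe, then with ask == false once a selected node's data has been computed.
static bool observe_eval(struct ggml_tensor * t, bool ask, void * user_data) {
    (void) user_data;
    if (ask) {
        return strcmp(t->name, "result_output") == 0;  // assumed name of the final logits node
    }
    fprintf(stderr, "evaluated %s\n", t->name);
    return true;  // returning false aborts graph evaluation
}

void configure(gpt_params & params) {
    params.cb_eval           = observe_eval;
    params.cb_eval_user_data = nullptr;
    params.warmup            = false;  // skip the warm-up run shown in common.cpp above
}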
13 changes: 4 additions & 9 deletions llama.cpp/ggml-backend.c
@@ -2,29 +2,22 @@
// vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi

#include "ggml-backend-impl.h"
#include "ggml-alloc.h"
#include "ggml-impl.h"

#include <assert.h>
#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdlib.h>

#include "ggml-alloc.h"
#include "ggml-cuda.h"
#include "ggml-impl.h"
#include "ggml-metal.h"
#include "llamafile/log.h"

#ifndef NDEBUG
#define NDEBUG // [jart] delete printf debugging
#endif


#define MAX(a, b) ((a) > (b) ? (a) : (b))


// backend buffer type

const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
@@ -2107,6 +2100,8 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
return true;
}

#include "llamafile/log.h"

GGML_CALL static void system_exit(int rc) {
exit(rc);
}
