Skip to content

Commit

Permalink
Merged the upstream updates for model loading code, and ditched the l…
Browse files Browse the repository at this point in the history
…egacy llama loaders since they were no longer needed.
  • Loading branch information
LostRuins committed Apr 10, 2023
2 parents 18a1547 + 180b693 commit f53238f
Show file tree
Hide file tree
Showing 20 changed files with 1,240 additions and 1,452 deletions.
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ LDFLAGS =

#lets try enabling everything
CFLAGS += -pthread -s
CXXFLAGS += -pthread -s
CXXFLAGS += -pthread -s -Wno-multichar

# OS specific
# TODO: support Windows
Expand Down Expand Up @@ -121,7 +121,7 @@ BLAS_BUILD =
# BLAS_BUILD is the command used for the OpenBLAS-enabled library target.
# On Windows it links the object files against libopenblas.lib into
# koboldcpp_blas.dll; on any other OS it only echoes a reminder message.
ifeq ($(OS),Windows_NT)
BLAS_BUILD = $(CXX) $(CXXFLAGS) ggml_blas.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o libopenblas.lib -shared -o koboldcpp_blas.dll $(LDFLAGS)
else
BLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. If you want to use openblas, please install it seperately, then link it manually with LLAMA_OPENBLAS=1'
BLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. If you want to use openblas, please install it seperately, then link it manually with LLAMA_OPENBLAS=1. This is just a reminder, not an error.'
endif

#
Expand Down Expand Up @@ -154,7 +154,7 @@ ggml_blas.o: ggml.c ggml.h
ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
$(CC) $(CFLAGS) -c otherarch/ggml_v1.c -o ggml_v1.o

llama.o: llama.cpp llama.h
llama.o: llama.cpp llama.h llama_internal.h
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o

common.o: examples/common.cpp examples/common.h
Expand Down
9 changes: 6 additions & 3 deletions examples/common.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
#include "common.h"

#include "ggml.h"

#include <cassert>
#include <cstring>
#include <fstream>
Expand Down Expand Up @@ -161,6 +159,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.use_color = true;
} else if (arg == "--mlock") {
params.use_mlock = true;
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--mtest") {
params.mem_test = true;
} else if (arg == "--verbose-prompt") {
Expand Down Expand Up @@ -240,9 +240,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
if (ggml_mlock_supported()) {
if (llama_mlock_supported()) {
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
if (llama_mmap_supported()) {
fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
fprintf(stderr, " -m FNAME, --model FNAME\n");
Expand Down
1 change: 1 addition & 0 deletions examples/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ struct gpt_params {
bool instruct = false; // instruction mode (used for Alpaca models)
bool ignore_eos = false; // do not stop generating after eos
bool perplexity = false; // compute perplexity over the prompt
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool mem_test = false; // compute maximum memory usage
bool verbose_prompt = false; // print prompt tokens before generation
Expand Down
1 change: 1 addition & 0 deletions examples/embedding/embedding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ int main(int argc, char ** argv) {
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
lparams.embedding = params.embedding;

Expand Down
1 change: 1 addition & 0 deletions examples/main/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ int main(int argc, char ** argv) {
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;

ctx = llama_init_from_file(params.model.c_str(), lparams);
Expand Down
1 change: 1 addition & 0 deletions examples/perplexity/perplexity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ int main(int argc, char ** argv) {
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
lparams.embedding = params.embedding;

Expand Down
9 changes: 4 additions & 5 deletions examples/quantize-stats/quantize-stats.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "ggml.h"
#include "llama.h"
#include "llama_internal.h"

#include <algorithm>
#include <cassert>
Expand Down Expand Up @@ -266,15 +267,13 @@ int main(int argc, char ** argv) {
}
}

// Sort tensors for consistent output
const auto tensors = llama_internal_get_tensor_map(ctx);
std::map<std::string, struct ggml_tensor *> tensors_sorted { tensors.begin(), tensors.end() };
const auto &tensors = llama_internal_get_tensor_map(ctx);

// check layer tensors
int included_layers = 0;
int64_t max_nelements = 0;
bool is_f16 = false;
for (const auto& kv_tensor : tensors_sorted) {
for (const auto& kv_tensor : tensors) {
if (!layer_included(params, kv_tensor.first)) {
continue;
}
Expand Down Expand Up @@ -315,7 +314,7 @@ int main(int argc, char ** argv) {

error_stats global_stats {};

for (const auto& kv_tensor : tensors_sorted) {
for (const auto& kv_tensor : tensors) {
if (!layer_included(params, kv_tensor.first)) {
continue;
}
Expand Down
1 change: 1 addition & 0 deletions expose.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ struct load_model_inputs
const bool f16_kv;
const char *model_filename;
const int n_parts_overwrite = -1;
const bool use_mmap;
};
struct generation_inputs
{
Expand Down
78 changes: 0 additions & 78 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,17 +97,6 @@ typedef void* thread_ret_t;
#define static_assert(cond, msg) _Static_assert(cond, msg)
#endif

// Compile-time switch for mlock support. It defaults to 0 and is redefined to 1
// only when the compiler offers __has_include and <sys/mman.h> can be found,
// in which case that header is also included here.
#define GGML_MLOCK_SUPPORT 0

#ifdef __has_include
#if __has_include(<sys/mman.h>)
#undef GGML_MLOCK_SUPPORT
#define GGML_MLOCK_SUPPORT 1
#include <sys/mman.h>
#endif
#endif


/*#define GGML_PERF*/
#define GGML_DEBUG 0
#define GGML_GELU_FP16
Expand Down Expand Up @@ -2690,21 +2679,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {

static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35");

//
// ggml object
//

// Bookkeeping record for one object stored in a ggml context.
// sizeof(struct ggml_object) must be a multiple of GGML_MEM_ALIGN (enforced by
// a static_assert), so do not add or remove fields without re-checking that.
struct ggml_object {
// presumably the byte offset of the object's data inside the context buffer — TODO confirm
size_t offs;
// size of the object; offs + size of the last object is what ggml_free reports as "memory used"
size_t size;

// link to another ggml_object (presumably the next one in the context's list — TODO confirm)
struct ggml_object * next;

// presumably filler so the struct size satisfies the GGML_MEM_ALIGN static_assert — TODO confirm
char padding[8];
};

static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);

static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");

Expand All @@ -2716,7 +2690,6 @@ struct ggml_context {
size_t mem_size;
void * mem_buffer;
bool mem_buffer_owned;
bool mem_buffer_mlocked;
bool no_alloc;

int n_objects;
Expand Down Expand Up @@ -3003,7 +2976,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
/*.mem_size =*/ params.mem_size,
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
/*.mem_buffer_mlocked =*/ false,
/*.no_alloc =*/ params.no_alloc,
/*.n_objects =*/ 0,
/*.objects_begin =*/ NULL,
Expand Down Expand Up @@ -3036,14 +3008,6 @@ void ggml_free(struct ggml_context * ctx) {
GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
__func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);

#if GGML_MLOCK_SUPPORT
if (ctx->mem_buffer_mlocked) {
if (munlock(ctx->mem_buffer, ctx->mem_size)) {
fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno));
}
}
#endif

if (ctx->mem_buffer_owned) {
free(ctx->mem_buffer);
}
Expand Down Expand Up @@ -3072,48 +3036,6 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
return result;
}

// Platform-specific hint that ggml_mlock appends to its "failed to mlock"
// error message. Apple builds get the longer sysctl-oriented text; all other
// platforms get the RLIMIT_MLOCK hint only.
#ifdef __APPLE__
#define MLOCK_SUGGESTION \
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
#else
#define MLOCK_SUGGESTION \
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
#endif

// Tells the caller whether this build was compiled with mlock support,
// i.e. whether GGML_MLOCK_SUPPORT is non-zero.
bool ggml_mlock_supported(void) {
    const bool supported = (GGML_MLOCK_SUPPORT != 0);
    return supported;
}

// Locks the context's memory buffer, plus an optional extra region, into RAM.
//
//   ctx            - context whose mem_buffer / mem_size describe the region to lock
//   opt_extra_addr - optional additional region to lock; only used when
//                    opt_extra_len is non-zero
//   opt_extra_len  - length in bytes of the extra region, 0 for "none"
//   err_p          - on failure receives a heap-allocated (malloc/strdup) message;
//                    it is NULL if that allocation itself failed. Presumably the
//                    caller is expected to free() it — confirm against callers.
//
// Returns true when the buffer is locked (or was already locked by an earlier
// call), false otherwise. On failure nothing is left locked by this call.
bool ggml_mlock(
        struct ggml_context * ctx,
        const void *opt_extra_addr,
        size_t opt_extra_len,
        char **err_p) {
    // TODO: Use SetProcessWorkingSetSize() + VirtualLock() on WIN32
#if GGML_MLOCK_SUPPORT
    if (ctx->mem_buffer_mlocked) {
        return true;
    }
    const bool buffer_locked = mlock(ctx->mem_buffer, ctx->mem_size) == 0;
    if (!buffer_locked ||
        (opt_extra_len &&
         mlock(opt_extra_addr, opt_extra_len))) {
        // Capture errno immediately: munlock/malloc below are allowed to overwrite it.
        const int mlock_errno = errno;
        if (buffer_locked) {
            // The extra region failed after the main buffer was locked. Undo the
            // partial lock: mem_buffer_mlocked stays false, so ggml_free would
            // otherwise never munlock it.
            munlock(ctx->mem_buffer, ctx->mem_size);
        }
        if ((*err_p = malloc(1024))) {
            snprintf(*err_p, 1024,
                     "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
                     ctx->mem_size + opt_extra_len,
                     strerror(mlock_errno));
        }
        return false;
    }
    ctx->mem_buffer_mlocked = true;
    return true;
#else // GGML_MLOCK_SUPPORT
    *err_p = strdup("can't mlock because it's not supported on this system");
    return false;
#endif // GGML_MLOCK_SUPPORT
}

////////////////////////////////////////////////////////////////////////////////

struct ggml_tensor * ggml_new_tensor_impl(
Expand Down
20 changes: 13 additions & 7 deletions ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,19 @@ enum ggml_op {
GGML_OP_COUNT,
};


// ggml object
// Bookkeeping record for one object stored in a ggml context.
// ggml.c static_asserts that sizeof(struct ggml_object) is a multiple of
// GGML_MEM_ALIGN, so do not add or remove fields without re-checking that.
struct ggml_object {
// presumably the byte offset of the object's data inside the context buffer — TODO confirm
size_t offs;
// size of the object; presumably in bytes — TODO confirm
size_t size;

// link to another ggml_object (presumably the next one in the context's list — TODO confirm)
struct ggml_object * next;

// presumably filler so the struct size satisfies the GGML_MEM_ALIGN static_assert — TODO confirm
char padding[8];
};

static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);

// n-dimensional tensor
struct ggml_tensor {
enum ggml_type type;
Expand Down Expand Up @@ -344,13 +357,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx);

size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);

bool ggml_mlock_supported(void);
bool ggml_mlock(
struct ggml_context * ctx,
const void *opt_extra_addr,
size_t opt_extra_len,
char **err_p);

struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
enum ggml_type type,
Expand Down
Binary file added koboldcpp.dll
Binary file not shown.
11 changes: 7 additions & 4 deletions koboldcpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ class load_model_inputs(ctypes.Structure):
("batch_size", ctypes.c_int),
("f16_kv", ctypes.c_bool),
("model_filename", ctypes.c_char_p),
("n_parts_overwrite", ctypes.c_int)]
("n_parts_overwrite", ctypes.c_int),
("use_mmap", ctypes.c_bool)]

class generation_inputs(ctypes.Structure):
_fields_ = [("seed", ctypes.c_int),
Expand Down Expand Up @@ -53,14 +54,15 @@ def init_library():
handle.generate.argtypes = [generation_inputs, ctypes.c_wchar_p] #apparently needed for osx to work. i duno why they need to interpret it that way but whatever
handle.generate.restype = generation_outputs

def load_model(model_filename,batch_size=8,max_context_length=512,n_parts_overwrite=-1,threads=6):
def load_model(model_filename,batch_size=8,max_context_length=512,n_parts_overwrite=-1,threads=6,use_mmap=False):
    """Fill a load_model_inputs struct and hand it to the native library.

    model_filename is encoded as UTF-8 before being stored; f16_kv is always
    set to True. Returns whatever handle.load_model returns.
    """
    params = load_model_inputs()
    params.f16_kv = True
    params.use_mmap = use_mmap
    params.threads = threads
    params.batch_size = batch_size
    params.n_parts_overwrite = n_parts_overwrite
    # starting context size; can be overwritten later
    params.max_context_length = max_context_length
    params.model_filename = model_filename.encode("UTF-8")
    return handle.load_model(params)

Expand Down Expand Up @@ -347,7 +349,7 @@ def main(args):
mdl_nparts = sum(1 for n in range(1, 9) if os.path.exists(f"{ggml_selected_file}.{n}")) + 1
modelname = os.path.abspath(ggml_selected_file)
print(f"Loading model: {modelname} \n[Parts: {mdl_nparts}, Threads: {args.threads}]")
loadok = load_model(modelname,8,maxctx,mdl_nparts,args.threads)
loadok = load_model(modelname,8,maxctx,mdl_nparts,args.threads,args.usemmap)
print("Load Model OK: " + str(loadok))

if not loadok:
Expand All @@ -369,7 +371,7 @@ def main(args):
if args.host=="":
epurl = f"http://localhost:{args.port}" + ("?streaming=1" if args.stream else "")
else:
epurl = f"http://{args.host}:{args.port}" + ("&streaming=1" if args.stream else "")
epurl = f"http://{args.host}:{args.port}" + ("?streaming=1" if args.stream else "")


print(f"Please connect to custom endpoint at {epurl}")
Expand All @@ -394,5 +396,6 @@ def main(args):
parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
parser.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
parser.add_argument("--usemmap", help="Use mmap to load newer models (default false)", action='store_true')
args = parser.parse_args()
main(args)
Binary file added koboldcpp_blas.dll
Binary file not shown.
Loading

0 comments on commit f53238f

Please sign in to comment.