Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ggml : reduce hash table reset cost #8698

Merged
merged 4 commits into from
Jul 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -325,9 +325,9 @@ ifdef LLAMA_DEBUG
endif
else
MK_CPPFLAGS += -DNDEBUG
MK_CFLAGS += -O3
MK_CXXFLAGS += -O3
MK_NVCCFLAGS += -O3
MK_CFLAGS += -O3 -g
MK_CXXFLAGS += -O3 -g
MK_NVCCFLAGS += -O3 -g
endif

ifdef LLAMA_SANITIZE_THREAD
Expand Down
2 changes: 1 addition & 1 deletion examples/eval-callback/eval-callback.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
} else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) &data[i];
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
printf("%12.4f", v);
sum += v;
Expand Down
4 changes: 2 additions & 2 deletions examples/imatrix/imatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
exit(1); //GGML_ASSERT(false);
exit(1); //GGML_ABORT("fatal error");
}
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
Expand Down Expand Up @@ -176,7 +176,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
exit(1); //GGML_ASSERT(false);
exit(1); //GGML_ABORT("fatal error");
}
++e.ncall;
if (m_params.verbosity > 1) {
Expand Down
6 changes: 3 additions & 3 deletions examples/llama-bench/llama-bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ static const char * output_format_str(output_formats format) {
case JSON: return "json";
case MARKDOWN: return "md";
case SQL: return "sql";
default: GGML_ASSERT(!"invalid output format");
default: GGML_ABORT("invalid output format");
}
}

Expand All @@ -176,7 +176,7 @@ static const char * split_mode_str(llama_split_mode mode) {
case LLAMA_SPLIT_MODE_NONE: return "none";
case LLAMA_SPLIT_MODE_LAYER: return "layer";
case LLAMA_SPLIT_MODE_ROW: return "row";
default: GGML_ASSERT(!"invalid split mode");
default: GGML_ABORT("invalid split mode");
}
}

Expand Down Expand Up @@ -1326,7 +1326,7 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
case SQL:
return std::unique_ptr<printer>(new sql_printer());
}
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}

int main(int argc, char ** argv) {
Expand Down
2 changes: 1 addition & 1 deletion examples/llava/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -869,7 +869,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = peg_0;
}
else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

Expand Down
2 changes: 1 addition & 1 deletion examples/tokenize/tokenize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
printf(">");
return;
}
GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way.");
GGML_ABORT("MultiByteToWideChar() failed in an unexpected way.");
}

LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr));
Expand Down
37 changes: 21 additions & 16 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,18 +254,8 @@

#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

#define GGML_ASSERT(x) \
do { \
if (!(x)) { \
fflush(stdout); \
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
ggml_print_backtrace(); \
abort(); \
} \
} while (0)

#ifndef NDEBUG
#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
#define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
#elif defined(__GNUC__)
#define GGML_UNREACHABLE() __builtin_unreachable()
#elif defined(_MSC_VER)
Expand All @@ -274,6 +264,17 @@
#define GGML_UNREACHABLE() ((void) 0)
#endif

#ifdef __cplusplus
#define GGML_NORETURN [[noreturn]]
#elif defined(_MSC_VER)
#define GGML_NORETURN __declspec(noreturn)
#else
#define GGML_NORETURN _Noreturn
#endif

#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)

// used to copy the number of elements and stride in bytes of tensors into local variables.
// main purpose is to reduce code duplication and improve readability.
//
Expand Down Expand Up @@ -322,6 +323,9 @@
extern "C" {
#endif

GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);

enum ggml_status {
GGML_STATUS_ALLOC_FAILED = -2,
GGML_STATUS_FAILED = -1,
Expand Down Expand Up @@ -636,8 +640,11 @@ extern "C" {
GGML_CGRAPH_EVAL_ORDER_COUNT
};

typedef uint32_t ggml_bitset_t;

struct ggml_hash_set {
size_t size;
ggml_bitset_t * used;
struct ggml_tensor ** keys;
};

Expand All @@ -651,7 +658,7 @@ extern "C" {
struct ggml_tensor ** grads;
struct ggml_tensor ** leafs;

struct ggml_hash_set visited_hash_table;
struct ggml_hash_set visited_hash_set;

enum ggml_cgraph_eval_order order;
};
Expand Down Expand Up @@ -698,8 +705,6 @@ extern "C" {
GGML_API int64_t ggml_cycles(void);
GGML_API int64_t ggml_cycles_per_ms(void);

GGML_API void ggml_print_backtrace(void);

// accepts a UTF-8 path, even on Windows
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);

Expand Down Expand Up @@ -2005,8 +2010,8 @@ extern "C" {

// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
Expand Down
42 changes: 18 additions & 24 deletions ggml/src/ggml-alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
GGML_ASSERT(!"not enough space in the buffer");
return;
GGML_ABORT("not enough space in the buffer");
}

void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
Expand Down Expand Up @@ -133,7 +132,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset,
return;
}
}
GGML_ASSERT(!"out of allocated_tensors");
GGML_ABORT("out of allocated_tensors");
}
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) {
Expand All @@ -142,8 +141,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
return;
}
}
fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
GGML_ASSERT(!"tensor not found");
GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
}
#endif

Expand Down Expand Up @@ -176,8 +174,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
// this should never happen
fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
GGML_ASSERT(!"not enough space in the buffer");
GGML_UNREACHABLE();
GGML_ABORT("not enough space in the buffer");
}
}

Expand Down Expand Up @@ -443,7 +440,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
}
}

free(galloc->hash_set.keys);
ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values);
free(galloc->bufts);
free(galloc->buffers);
Expand All @@ -456,7 +453,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
typedef struct ggml_gallocr * ggml_gallocr_t;

static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
return &galloc->hash_values[i];
}

Expand Down Expand Up @@ -565,8 +562,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {

static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
// clear hash tables
memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
ggml_hash_set_reset(&galloc->hash_set);
memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);

// allocate leafs
// these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
Expand Down Expand Up @@ -671,21 +668,19 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
}

bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
size_t hash_size = graph->visited_hash_table.size;
size_t min_hash_size = graph->n_nodes + graph->n_leafs;
// add 25% margin to avoid hash collisions
min_hash_size += min_hash_size / 4;

// initialize hash table
if (galloc->hash_set.size < hash_size) {
free(galloc->hash_set.keys);
free(galloc->hash_values);
galloc->hash_set.size = hash_size;
galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
if (galloc->hash_set.size < min_hash_size) {
ggml_hash_set_free(&galloc->hash_set);
galloc->hash_set = ggml_hash_set_new(min_hash_size);
GGML_ASSERT(galloc->hash_set.keys != NULL);

free(galloc->hash_values);
galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
GGML_ASSERT(galloc->hash_values != NULL);
} else {
// reset hash table
memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
}

// reset allocators
Expand Down Expand Up @@ -817,8 +812,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
}

static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
return talloc->size_max >= node_size;
}

Expand Down
Loading
Loading