Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

quantize: add imatrix and dataset metadata in GGUF #6658

Merged
merged 16 commits into from
Apr 26, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -760,7 +760,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

Expand Down
82 changes: 44 additions & 38 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,48 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
return result;
}

bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
const char* sep = strchr(data, '=');
if (sep == nullptr || sep - data >= 128) {
fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
return false;
}
llama_model_kv_override kvo;
std::strncpy(kvo.key, data, sep - data);
kvo.key[sep - data] = 0;
sep++;
if (strncmp(sep, "int:", 4) == 0) {
sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
kvo.int_value = std::atol(sep);
} else if (strncmp(sep, "float:", 6) == 0) {
sep += 6;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
kvo.float_value = std::atof(sep);
} else if (strncmp(sep, "bool:", 5) == 0) {
sep += 5;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
if (std::strcmp(sep, "true") == 0) {
kvo.bool_value = true;
} else if (std::strcmp(sep, "false") == 0) {
kvo.bool_value = false;
} else {
fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
return false;
}

} else if (strncmp(sep, "str:", 4) == 0) {
sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
kvo.str_value = strdup(sep);
phymbert marked this conversation as resolved.
Show resolved Hide resolved
} else {
fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
return false;
}
overrides.emplace_back(std::move(kvo));
return true;
}

bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
llama_sampling_params& sparams = params.sparams;

Expand Down Expand Up @@ -1153,47 +1195,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true;
return true;
}
char* sep = strchr(argv[i], '=');
if (sep == nullptr || sep - argv[i] >= 128) {
fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
invalid_param = true;
return true;
}
struct llama_model_kv_override kvo;
std::strncpy(kvo.key, argv[i], sep - argv[i]);
kvo.key[sep - argv[i]] = 0;
sep++;
if (strncmp(sep, "int:", 4) == 0) {
sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
kvo.int_value = std::atol(sep);
}
else if (strncmp(sep, "float:", 6) == 0) {
sep += 6;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
kvo.float_value = std::atof(sep);
}
else if (strncmp(sep, "bool:", 5) == 0) {
sep += 5;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
if (std::strcmp(sep, "true") == 0) {
kvo.bool_value = true;
}
else if (std::strcmp(sep, "false") == 0) {
kvo.bool_value = false;
}
else {
fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
invalid_param = true;
return true;
}
}
else {
if (!parse_kv_override(argv[i], params.kv_overrides)) {
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
invalid_param = true;
return true;
}
params.kv_overrides.push_back(kvo);
return true;
}
#ifndef LOG_DISABLE_LOGS
Expand Down Expand Up @@ -1461,7 +1467,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
printf(" -ptc N, --print-token-count N\n");
printf(" print token count every N tokens (default: %d)\n", params.n_print);
printf("\n");
Expand Down
2 changes: 2 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,8 @@ struct gpt_params {
std::string image = ""; // path to an image file
};

// Parse a single "KEY=TYPE:VALUE" metadata override (the --override-kv argument format) and
// append it to `overrides`. Returns false, after printing a diagnostic to stderr, when the
// input is malformed or uses an unknown type.
bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);

bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);

bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
Expand Down
74 changes: 41 additions & 33 deletions examples/imatrix/imatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ struct Stats {
};

struct StatParams {
std::string dataset;
std::string ofile = "imatrix.dat";
int n_output_frequency = 10;
int verbosity = 1;
Expand All @@ -46,7 +47,7 @@ class IMatrixCollector {
std::vector<float> m_src1_data;
std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id
//
void save_imatrix(const char * file_name) const;
void save_imatrix(const char * file_name, const char * dataset) const;
void keep_imatrix(int ncall) const;
};

Expand Down Expand Up @@ -184,32 +185,38 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}

void IMatrixCollector::save_imatrix() const {
save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
}

void IMatrixCollector::keep_imatrix(int ncall) const {
auto file_name = m_params.ofile;
if (file_name.empty()) file_name = "imatrix.dat";
file_name += ".at_";
file_name += std::to_string(ncall);
save_imatrix(file_name.c_str());
save_imatrix(file_name.c_str(), m_params.dataset.c_str());
}

void IMatrixCollector::save_imatrix(const char * fname) const {
void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size();
out.write((const char*)&n_entries, sizeof(n_entries));
for (auto& p : m_stats) {
out.write((const char *) &n_entries, sizeof(n_entries));
for (const auto & p : m_stats) {
int len = p.first.size();
out.write((const char*)&len, sizeof(len));
out.write((const char *) &len, sizeof(len));
out.write(p.first.c_str(), len);
out.write((const char*)&p.second.ncall, sizeof(p.second.ncall));
out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
int nval = p.second.values.size();
out.write((const char*)&nval, sizeof(nval));
if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float));
out.write((const char *) &nval, sizeof(nval));
if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float));
}

// Write the dataset name at the end of the file to later on specify it in quantize
int n_dataset = strlen(dataset);
out.write((const char *) &n_dataset, sizeof(n_dataset));
out.write(dataset, n_dataset);

if (m_params.verbosity > 0) {
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname);
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
}
}

Expand Down Expand Up @@ -532,6 +539,29 @@ int main(int argc, char ** argv) {
}
}

gpt_params params;
params.n_batch = 512;
if (!gpt_params_parse(args.size(), args.data(), params)) {
return 1;
}

params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);

print_build_info();

if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}

fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);

std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}

sparams.dataset = params.prompt_file;
g_collector.set_parameters(std::move(sparams));

if (!combine_files.empty()) {
Expand Down Expand Up @@ -570,28 +600,6 @@ int main(int argc, char ** argv) {
}
}

gpt_params params;
params.n_batch = 512;
if (!gpt_params_parse(args.size(), args.data(), params)) {
return 1;
}

params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);

print_build_info();

if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}

fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);

std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}

llama_backend_init();
llama_numa_init(params.numa);

Expand Down
2 changes: 1 addition & 1 deletion examples/quantize/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
set(TARGET quantize)
add_executable(${TARGET} quantize.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
74 changes: 33 additions & 41 deletions examples/quantize/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#include <unordered_map>
#include <fstream>
#include <cmath>
#include <algorithm>

struct quant_option {
std::string name;
Expand Down Expand Up @@ -53,6 +52,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
};

static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix.dataset";
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.n_entries";

static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
std::string ftype_str;
Expand Down Expand Up @@ -112,7 +113,7 @@ static void usage(const char * executable) {
exit(1);
}

static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
static void load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
std::ifstream in(imatrix_file.c_str(), std::ios::binary);
if (!in) {
printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
Expand Down Expand Up @@ -159,15 +160,27 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
}
}

// latest imatrix version contains the dataset filename at the end of the file
if (in.peek() != EOF) {
int dataset_len;
in.read((char *)&dataset_len, sizeof(dataset_len));
std::vector<char> dataset_as_vec(dataset_len+1);
in.read((char *)dataset_as_vec.data(), dataset_len);
dataset_as_vec[dataset_len] = 0;
imatrix_dataset = std::string{dataset_as_vec.data()};
phymbert marked this conversation as resolved.
Show resolved Hide resolved
printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
}
printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
}

static void prepare_imatrix(const std::string & imatrix_file,
std::string & imatrix_dataset,
const std::vector<std::string> & included_weights,
const std::vector<std::string> & excluded_weights,
std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
if (!imatrix_file.empty()) {
load_imatrix(imatrix_file, imatrix_data);
load_imatrix(imatrix_file, imatrix_dataset, imatrix_data);
}
if (imatrix_data.empty()) {
return;
Expand Down Expand Up @@ -210,43 +223,6 @@ static ggml_type parse_ggml_type(const char * arg) {
return result;
}

static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
const char* sep = strchr(data, '=');
if (sep == nullptr || sep - data >= 128) {
fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
return false;
}
llama_model_kv_override kvo;
std::strncpy(kvo.key, data, sep - data);
kvo.key[sep - data] = 0;
sep++;
if (strncmp(sep, "int:", 4) == 0) {
sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
kvo.int_value = std::atol(sep);
} else if (strncmp(sep, "float:", 6) == 0) {
sep += 6;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
kvo.float_value = std::atof(sep);
} else if (strncmp(sep, "bool:", 5) == 0) {
sep += 5;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
if (std::strcmp(sep, "true") == 0) {
kvo.bool_value = true;
} else if (std::strcmp(sep, "false") == 0) {
kvo.bool_value = false;
} else {
fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
return false;
}
} else {
fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
return false;
}
overrides.emplace_back(std::move(kvo));
return true;
}

int main(int argc, char ** argv) {
if (argc < 3) {
usage(argv[0]);
Expand Down Expand Up @@ -313,10 +289,26 @@ int main(int argc, char ** argv) {
usage(argv[0]);
}

std::string imatrix_dataset;
std::unordered_map<std::string, std::vector<float>> imatrix_data;
prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data);
if (!imatrix_data.empty()) {
params.imatrix = &imatrix_data;
if (!imatrix_dataset.empty()) {
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
kvo.str_value = strdup(imatrix_dataset.c_str());
kv_overrides.emplace_back(std::move(kvo));
}

{
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
kvo.int_value = imatrix_data.size();
kv_overrides.emplace_back(std::move(kvo));
}
}
if (!kv_overrides.empty()) {
kv_overrides.emplace_back();
Expand Down
Loading
Loading