diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1f2c55f2dccdf..be23ad1699391 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1095,6 +1095,7 @@ struct llama_server_context std::lock_guard lock(mutex_results); task_result res; res.id = id; + res.stop = false; res.error = true; res.result_json = { { "content", error } }; queue_results.push_back(res); @@ -1255,6 +1256,7 @@ struct llama_server_context std::lock_guard lock(mutex_tasks); task_server task; task.id = id_gen++; + task.target_id = 0; task.data = data; task.infill_mode = infill; task.embedding_mode = embedding; diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 50e03de500747..f0db7ae357a2f 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -8057,7 +8058,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ if (tensor->op == GGML_OP_MUL_MAT) { if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { #ifndef NDEBUG - fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]); + fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]); #endif return false; }