Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ORCA Format KV Cache Utilization in Inference Response Header #7839

Draft
wants to merge 7 commits into
base: r24.10
Choose a base branch
from
76 changes: 76 additions & 0 deletions src/http_server.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3225,6 +3225,32 @@ HTTPAPIServer::HandleGenerate(
req, RestrictedCategory::INFERENCE, restricted_apis_);

AddContentTypeHeader(req, "application/json");

// Add KV-cache metrics to the response header.
// Get the metrics in Prometheus format
TRITONSERVER_Metrics* metrics = nullptr;
TRITONSERVER_Error* err = TRITONSERVER_ServerMetrics(server_.get(), &metrics);
if (err == nullptr) {
const char* base;
size_t byte_size;
err = TRITONSERVER_MetricsFormatted(
metrics, TRITONSERVER_METRIC_PROMETHEUS, &base, &byte_size);
if (err == nullptr) {
std::string kv_utilization(base, byte_size);
// Extract the KV utilization metrics from the Prometheus formatted string.
std::string extracted_kv_metrics = ExtractKVMetrics(kv_utilization);
evhtp_headers_add_header(
req->headers_out,
evhtp_header_new("endpoint-load-metrics", extracted_kv_metrics.c_str(), 1, 1));
}
}
TRITONSERVER_MetricsDelete(metrics);
// Handle potential errors
if (err != nullptr) {
LOG_ERROR << "Failed to get KV metrics: " << TRITONSERVER_ErrorMessage(err);
TRITONSERVER_ErrorDelete(err);
}

if (req->method != htp_method_POST) {
RETURN_AND_RESPOND_WITH_ERR(
req, EVHTP_RES_METHNALLOWED, "Method Not Allowed");
Expand Down Expand Up @@ -3381,6 +3407,56 @@ HTTPAPIServer::HandleGenerate(
request_release_payload.release();
}

// Builds the ORCA-style load-metrics string from the server metrics.
//
// 'prometheus_metrics' is the server's metrics dump in Prometheus text
// format. Every 'nv_trt_llm_kv_cache_block_metrics' sample in it is scanned
// and the "tokens_per", "used" and "max" block counts are picked up; other
// block types are ignored.
//
// Returns the string "JSON " followed by a JSON object of the form
//   {"named_metrics": {"kv_cache_utilization": <used / max>,
//                      "max_token_capacity": <max * tokens_per>}}
// Both values are 0 when no matching sample is present.
//
// NOTE(review): when the dump holds samples for several models, the values of
// the last matching sample win -- confirm this is the intended behavior.
std::string
HTTPAPIServer::ExtractKVMetrics(const std::string& prometheus_metrics)
{
  // Compiled once and shared by all callers instead of being rebuilt on every
  // request; a constructed RE2 object may be used concurrently for matching.
  static const RE2 kv_cache_block_regex(
      R"(nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=\"(?P<type>\w+)\",model=\"(?P<model>.*?)\",version=\"1\"}\s+(?P<value>\d+))");

  uint64_t tokens_per_block = 0;
  uint64_t used_blocks = 0;
  uint64_t max_blocks = 0;

  re2::StringPiece input(prometheus_metrics);
  // 'model' is captured only because the capture arguments are positional.
  std::string type, model, value;

  while (RE2::FindAndConsume(
      &input, kv_cache_block_regex, &type, &model, &value)) {
    uint64_t numeric_value = 0;
    try {
      numeric_value = std::stoull(value);
    }
    catch (const std::exception&) {
      // '\d+' can match a digit run that does not fit in 64 bits, in which
      // case std::stoull throws. Skip that sample rather than letting the
      // exception escape from the request handling path.
      continue;
    }

    if (type == "tokens_per") {
      tokens_per_block = numeric_value;
    } else if (type == "used") {
      used_blocks = numeric_value;
    } else if (type == "max") {
      max_blocks = numeric_value;
    }
  }

  // Derived metrics. Utilization stays 0 when the block capacity is unknown
  // so that the division is never performed with a zero denominator.
  double kv_cache_utilization = 0.0;
  if (max_blocks > 0) {
    kv_cache_utilization =
        static_cast<double>(used_blocks) / static_cast<double>(max_blocks);
  }
  const uint64_t max_token_capacity = max_blocks * tokens_per_block;

  // Format the metrics according to the ORCA protocol
  triton::common::TritonJson::Value orca_metrics(
      triton::common::TritonJson::ValueType::OBJECT);
  triton::common::TritonJson::Value named_metrics(
      orca_metrics, triton::common::TritonJson::ValueType::OBJECT);

  named_metrics.AddDouble("kv_cache_utilization", kv_cache_utilization);
  named_metrics.AddUInt("max_token_capacity", max_token_capacity);

  orca_metrics.Add("named_metrics", std::move(named_metrics));

  triton::common::TritonJson::WriteBuffer buffer;
  orca_metrics.Write(&buffer);

  // The "JSON " prefix tags the serialization format of the payload.
  return std::string("JSON ") + buffer.Contents();
}

TRITONSERVER_Error*
HTTPAPIServer::ModelInputMetadata(
const std::string& model_name, const int64_t model_version,
Expand Down
5 changes: 5 additions & 0 deletions src/http_server.h
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,11 @@ class HTTPAPIServer : public HTTPServer {
evhtp_request_t* req, const std::string& model_name,
const std::string& model_version_str, bool streaming);

// Helper function to get the KV-cache utilization metrics for the
// infer response header. 'prometheus_metrics' is the server metrics in
// Prometheus text format; the returned string is the value to place in
// the header.
std::string ExtractKVMetrics(
const std::string& prometheus_metrics);

// 'meta_data_root' is the root JSON document for 'input_metadata'.
// In TritonJson, the Value objects are references to the root document.
// Therefore the document must stay valid.
Expand Down