From 3d47404404d922cc60751a56b1ed9a92995d1d28 Mon Sep 17 00:00:00 2001
From: Ken Giusti <kgiusti@redhat.com>
Date: Mon, 25 Sep 2023 11:48:51 -0400
Subject: [PATCH] Fixes #1182: Refactor Prometheus metrics reporting
 (ISSUE-1182 part 2) (#1200)

o Implement dynamic registration of each alloc_pool metric
 o Refactor http-libwebsockets.c /metrics implementation
 o Update unit tests
---
 docs/notes/prometheus.adoc         | 111 +++++
 include/qpid/dispatch/alloc_pool.h |   4 +-
 src/alloc_pool.c                   |  22 +-
 src/http-libwebsockets.c           | 649 +++++++++++++++++++----------
 src/http.h                         |   5 +
 tests/system_tests_http.py         |  87 ++--
 6 files changed, 613 insertions(+), 265 deletions(-)
 create mode 100644 docs/notes/prometheus.adoc

diff --git a/docs/notes/prometheus.adoc b/docs/notes/prometheus.adoc
new file mode 100644
index 000000000..e9e6284bb
--- /dev/null
+++ b/docs/notes/prometheus.adoc
@@ -0,0 +1,111 @@
+////
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+////
+
+= Monitoring Router Metrics Via Prometheus
+
+The router can be configured to enable metrics scraping via
+Prometheus. Metrics are provided via an HTTP service running in the
+router. A snapshot of the metrics can be obtained by issuing an HTTP
+GET request to the service for the */metrics* URL path.
+
+== Configuration
+
+Prometheus support is disabled by default. To enable metrics access an
+HTTP service must be configured on the router. This is done by
+specifying an *io.skupper.router.listener* entry in the router
+configuration (or via in-band management). The listener entry must
+provide:
+
+* The host IP address/name
+* The TCP port number
+* The _http_ attribute set to True
+
+For example, the following listener entry enables an HTTP server
+listening on localhost port 22976.
+
+  listener {
+    port: 22976
+    http: True
+    host: localhost
+    saslMechanisms: ANONYMOUS
+    idleTimeoutSeconds: 120
+    authenticatePeer: no
+    role: normal
+  }
+
+The Prometheus server must also be configured to scrape the
+router. This requires adding a job in the Prometheus server's
+*scrape-config* configuration for the router. An example job
+configuration for the above example listener could be:
+
+  scrape_configs:
+    - job_name: skupper-router
+      metrics_path: /metrics
+      static_configs:
+        - targets:
+            - localhost:22976
+
+== Metrics
+
+The metrics provided by the router are intended for use by developers
+to aid fault monitoring and debugging. Therefore the metrics content
+may change between releases as features are added or removed.
+
+=== Heap Allocation Metrics
+
+A subset of the router metrics are concerned with the router's heap
+memory utilization. The router uses a cache to manage instances of
+data objects that have been allocated from the heap. This cache avoids
+the overhead of allocating and freeing frequently used data objects
+from the system's heap.
+
+See alloc_pool.c for implementation details.
+
+The cache is a pool of data objects that have been allocated from the
+heap for use by the router. Each data type has its own dedicated
+cache. When the router needs an instance of said data type it will
+first attempt to claim an object from the cache. If the cache is
+empty, the router will instead allocate a batch of data objects from
+the system heap. It will reserve one data object instance from the
+batch for immediate use and place the remaining into the cache. When
+the router no longer needs a particular instance of a data object it
+will be placed back into the cache and can be re-used at a later time.
+
+Given this implementation, a particular instance of a data object may
+be either:
+
+* in the cache (in standby - available for use when needed)
+* or currently in use by the router.
+
+Each data type will have a set of 4 metrics associated with it:
+
+* total_allocated: total number of objects that are currently allocated from the heap
+* total_in_use: total objects currently being used by the router
+* total_in_cache: total objects in the cache
+* total_bytes: the sum of all memory allocated from the heap for the given data type
+
+These metrics adhere to the following relationships:
+
+* total_allocated = total_in_use + total_in_cache
+* total_bytes = (sizeof(<type>) * total_allocated)
+
+
+
+
+
diff --git a/include/qpid/dispatch/alloc_pool.h b/include/qpid/dispatch/alloc_pool.h
index 49c9efb61..f30c3e3de 100644
--- a/include/qpid/dispatch/alloc_pool.h
+++ b/include/qpid/dispatch/alloc_pool.h
@@ -122,7 +122,7 @@ static inline void *qd_alloc_deref_safe_ptr(const qd_alloc_safe_ptr_t *sp)
  */
 void qd_alloc_desc_init(const char *name, qd_alloc_type_desc_t *desc, size_t size, const size_t *additional_size,
                         const qd_alloc_config_t *config);
-qd_alloc_stats_t qd_alloc_desc_stats(qd_alloc_type_desc_t *desc);
+qd_alloc_stats_t qd_alloc_desc_stats(const qd_alloc_type_desc_t *desc);  // thread safe
 // clang-format off
 #define ALLOC_DEFINE_CONFIG(T,S,A,C)                                    \
     qd_alloc_type_desc_t __desc_##T  __attribute__((aligned(64)));      \
@@ -152,5 +152,5 @@ qd_alloc_stats_t qd_alloc_desc_stats(qd_alloc_type_desc_t *desc);
 void qd_alloc_initialize(void);
 void qd_alloc_debug_dump(const char *file);
 void qd_alloc_finalize(void);
-
+size_t qd_alloc_type_size(const qd_alloc_type_desc_t *desc);  // thread safe
 #endif
diff --git a/src/alloc_pool.c b/src/alloc_pool.c
index 86a95f6aa..1ea9ef2cc 100644
--- a/src/alloc_pool.c
+++ b/src/alloc_pool.c
@@ -24,6 +24,7 @@
 #include "config.h"
 #include "entity.h"
 #include "entity_cache.h"
+#include "http.h"
 #include "qd_asan_interface.h"
 
 #include "qpid/dispatch/alloc.h"
@@ -500,9 +501,17 @@ void qd_alloc_initialize(void)
         desc->debug = (void *) items;
 #endif
 
+        // cycle the lock to flush the initialized desc before handing it off to other threads (avoids a spurious tsan
+        // error)
+
+        sys_mutex_lock(&desc->lock);
+        sys_mutex_unlock(&desc->lock);
+
         // now add the descriptor to the management entity database
+        // and telemetry metrics
 
         qd_entity_cache_add(QD_ALLOCATOR_TYPE, desc);
+        qd_http_add_alloc_metric(desc->type_name, desc);
     }
 
 #ifdef QD_MEMORY_DEBUG
@@ -545,6 +554,7 @@ void qd_alloc_finalize(void)
 
     for (qd_alloc_type_desc_t *desc = DEQ_HEAD(desc_list); desc; desc = DEQ_NEXT(desc)) {
         qd_entity_cache_remove(QD_ALLOCATOR_TYPE, desc);
+        qd_http_remove_alloc_metric(desc->type_name);
 
         //
         // Reclaim the items on the global free pool
@@ -672,15 +682,21 @@ QD_EXPORT qd_error_t qd_entity_refresh_allocator(qd_entity_t* entity, void *impl
     return qd_error_code();
 }
 
-qd_alloc_stats_t qd_alloc_desc_stats(qd_alloc_type_desc_t *desc)
+qd_alloc_stats_t qd_alloc_desc_stats(const qd_alloc_type_desc_t *desc)
 {
-    sys_mutex_lock(&desc->lock);
+    sys_mutex_t *lock = (sys_mutex_t *) &desc->lock;  // cast away const
+    sys_mutex_lock(lock);
     qd_alloc_stats_t stats = desc->stats;
-    sys_mutex_unlock(&desc->lock);
+    sys_mutex_unlock(lock);
 
     return stats;
 }
 
+size_t qd_alloc_type_size(const qd_alloc_type_desc_t *desc)
+{
+    return desc->total_size;
+}
+
 void qd_alloc_debug_dump(const char *file) {
     debug_dump = file ? strdup(file) : 0;
 }
diff --git a/src/http-libwebsockets.c b/src/http-libwebsockets.c
index 3921c3882..b6690f309 100644
--- a/src/http-libwebsockets.c
+++ b/src/http-libwebsockets.c
@@ -22,8 +22,10 @@
 #include "server_private.h"
 #include "qd_connection.h"
 
+#include "qpid/dispatch/alloc_pool.h"
 #include "qpid/dispatch/amqp.h"
 #include "qpid/dispatch/atomic.h"
+#include "qpid/dispatch/ctools.h"
 #include "qpid/dispatch/protocol_adaptor.h"
 #include "qpid/dispatch/threading.h"
 #include "qpid/dispatch/timer.h"
@@ -96,18 +98,28 @@ typedef struct connection_t {
     struct lws *wsi;
 } connection_t;
 
+// Instantiated for every HTTP request, this holds the statistics to be written in the response
+//
 typedef struct stats_request_state_t {
-    bool callback_completed;
-    bool wsi_deleted;
+    bool callback_completed;  // T: the core has written the global statistics to the stats field
+    bool wsi_deleted;         // T: client has closed, may release this state instance
     qdr_global_stats_t stats;
     qd_http_server_t *server;
     struct lws *wsi;
+    size_t buffer_size;       // extra octets past lws_prefix[LWS_PRE] for HTTP output
+    uint8_t lws_prefix[LWS_PRE];
+    // buffer_size extra octets are appended to this structure when it is allocated. This space is used for the HTTP
+    // response. See new_stats_request_state(). Use &lws_prefix[LWS_PRE] as the start of the output buffer.
 } stats_request_state_t;
+static stats_request_state_t *new_stats_request_state(size_t buffer_size);
+static void free_stats_request_state(stats_request_state_t *);
 
+// Context passed to metrics and healthz protocol callbacks. Instantiated by the LWS thread, represents a single HTTP
+// request transaction.
+//
 typedef struct stats_t {
-    size_t current;
-    bool headers_sent;
-    stats_request_state_t *context;
+    stats_request_state_t *state;
+    bool response_complete;  // T: HTTP response sent
 } stats_t;
 
 /* Navigating from WSI pointer to qd objects */
@@ -445,14 +457,53 @@ static void connection_wake(qd_connection_t *qd_conn)
     }
 }
 
+//
+// Metrics
+//
+// Metrics are reported via an HTTP get request on the url "http://<router>/metrics". The metrics in the HTTP response
+// are formatted for consumption by Prometheus - see the description "exposition formats" at the Prometheus website for
+// details.
+//
+// Each metric is rendered as ASCII text. Two lines of text are generated for each metric. The format of these lines
+// are (minus quotes):
+//
+// "# TYPE <metric-name> <metric-type>\n"
+// "<metric-name> <value>\n"
+//
+// Currently all metric values are uint64_t integers, and the metric-type is either "counter" or "gauge". Counters are
+// those metrics that only increase (may allow reset to zero). Gauges are those metrics whose values may increase or
+// decrease over time.
+//
+// The current http-response body buffering implementation is... interesting. When a request arrives all counters are
+// fetched then rendered into an output buffer. After all metrics have been written to the buffer the buffer is written
+// to the LWS internal network buffer(s). Metrics can either be fetched synchronously (alloc-pool metrics) or require an
+// asynchronous callback (router core metrics). The output buffer rendering and writing all occur on the http thread -
+// see callback_metrics().
+//
+// Given this implementation it is necessary to ensure that the output buffer is large enough to hold all metrics. The
+// following definitions are used to compute the necessary buffer size. These values may need updating occasionally
+// should metrics be added/removed. I've added many debug asserts to prevent accidental buffer overflow should the
+// metrics not be updated properly.
+//
+// TODO(kgiusti): refactor this to use a smaller buffer with a chunked-output approach if possible with LWS.
+#define MAX_METRIC_NAME_LEN  48
+#define MAX_METRIC_VALUE_LEN 20  // uint64_t in decimal
+#define MAX_METRIC_TYPE_LEN  7   // strlen("counter")
+#define PER_METRIC_BUF_SIZE ((2 * MAX_METRIC_NAME_LEN) + MAX_METRIC_VALUE_LEN + MAX_METRIC_TYPE_LEN + 11)
+#define PER_ALLOC_METRIC_COUNT 4  // 4 metrics per alloc type
+
+#define HTTP_HEADER_LEN 128  // reserve space for headers added by LWS (128 is a guess, asserted in callback).
+#define HEALTHZ_BUF_SIZE 2048 // for /healthz url response data
+
+
 /**
- * Called on router worker thread
+ * Called on router worker thread: passes latest router stats to the http thread for processing
  */
 static void handle_stats_results(void *context, bool discard)
 {
     stats_request_state_t* state = (stats_request_state_t*) context;
     if (state->wsi_deleted || discard) {
-        free(state);
+        free_stats_request_state(state);
     } else {
         qd_http_server_t *hs = state->server;
         if (hs) {
@@ -463,90 +514,48 @@ static void handle_stats_results(void *context, bool discard)
 }
 
 /**
- * Called on http thread
+ * Called on http thread: process the stats arriving from the router thread
  */
 static void handle_stats_result_HT(stats_request_state_t* state)
 {
     if (state->wsi_deleted) {
-        free(state);
+        free_stats_request_state(state);
     } else {
         state->callback_completed = true;
         lws_callback_on_writable(state->wsi);
     }
 }
 
-typedef int (*int_metric) (qdr_global_stats_t *stats);
+typedef uint64_t (*uint64_metric) (const qdr_global_stats_t *stats);
 typedef struct metric_definition {
     const char* name;
     const char* type;
-    int_metric value;
+    uint64_metric get_value;
 } metric_definition;
 
-typedef struct allocator_metric_definition {
-    const char* name;
-    qd_alloc_stats_t (*fn)(void);
-} allocator_metric_definition;
-
-static int stats_get_connections(qdr_global_stats_t *stats) { return stats->connections; }
-static int stats_get_links(qdr_global_stats_t *stats) { return stats->links; }
-static int stats_get_addrs(qdr_global_stats_t *stats) { return stats->addrs; }
-static int stats_get_routers(qdr_global_stats_t *stats) { return stats->routers; }
-static int stats_get_auto_links(qdr_global_stats_t *stats) { return stats->auto_links; }
-static int stats_get_presettled_deliveries(qdr_global_stats_t *stats) { return stats->presettled_deliveries; }
-static int stats_get_dropped_presettled_deliveries(qdr_global_stats_t *stats) { return stats->dropped_presettled_deliveries; }
-static int stats_get_accepted_deliveries(qdr_global_stats_t *stats) { return stats->accepted_deliveries; }
-static int stats_get_released_deliveries(qdr_global_stats_t *stats) { return stats->released_deliveries; }
-static int stats_get_rejected_deliveries(qdr_global_stats_t *stats) { return stats->rejected_deliveries; }
-static int stats_get_modified_deliveries(qdr_global_stats_t *stats) { return stats->modified_deliveries; }
-static int stats_get_deliveries_ingress(qdr_global_stats_t *stats) { return stats->deliveries_ingress; }
-static int stats_get_deliveries_egress(qdr_global_stats_t *stats) { return stats->deliveries_egress; }
-static int stats_get_deliveries_transit(qdr_global_stats_t *stats) { return stats->deliveries_transit; }
-static int stats_get_deliveries_ingress_route_container(qdr_global_stats_t *stats) { return stats->deliveries_ingress_route_container; }
-static int stats_get_deliveries_egress_route_container(qdr_global_stats_t *stats) { return stats->deliveries_egress_route_container; }
-static int stats_get_deliveries_delayed_1sec(qdr_global_stats_t *stats) { return stats->deliveries_delayed_1sec; }
-static int stats_get_deliveries_delayed_10sec(qdr_global_stats_t *stats) { return stats->deliveries_delayed_10sec; }
-static int stats_get_deliveries_stuck(qdr_global_stats_t *stats) { return stats->deliveries_stuck; }
-static int stats_get_links_blocked(qdr_global_stats_t *stats) { return stats->links_blocked; }
-static int stats_get_deliveries_redirected_to_fallback(qdr_global_stats_t *stats) { return stats->deliveries_redirected_to_fallback; }
-
-qd_alloc_stats_t alloc_stats_qd_bitmask_t(void);
-qd_alloc_stats_t alloc_stats_qd_buffer_t(void);
-qd_alloc_stats_t alloc_stats_qd_composed_field_t(void);
-qd_alloc_stats_t alloc_stats_qd_composite_t(void);
-qd_alloc_stats_t alloc_stats_qd_connection_t(void);
-qd_alloc_stats_t alloc_stats_qd_hash_handle_t(void);
-qd_alloc_stats_t alloc_stats_qd_hash_item_t(void);
-qd_alloc_stats_t alloc_stats_qd_iterator_t(void);
-qd_alloc_stats_t alloc_stats_qd_link_ref_t(void);
-qd_alloc_stats_t alloc_stats_qd_link_t(void);
-qd_alloc_stats_t alloc_stats_qd_listener_t(void);
-qd_alloc_stats_t alloc_stats_qd_log_entry_t(void);
-qd_alloc_stats_t alloc_stats_qd_management_context_t(void);
-qd_alloc_stats_t alloc_stats_qd_message_content_t(void);
-qd_alloc_stats_t alloc_stats_qd_message_t(void);
-qd_alloc_stats_t alloc_stats_qd_node_t(void);
-qd_alloc_stats_t alloc_stats_qd_parse_node_t(void);
-qd_alloc_stats_t alloc_stats_qd_parsed_field_t(void);
-qd_alloc_stats_t alloc_stats_qd_timer_t(void);
-qd_alloc_stats_t alloc_stats_qdr_action_t(void);
-qd_alloc_stats_t alloc_stats_qdr_address_config_t(void);
-qd_alloc_stats_t alloc_stats_qdr_address_t(void);
-qd_alloc_stats_t alloc_stats_qdr_connection_info_t(void);
-qd_alloc_stats_t alloc_stats_qdr_connection_t(void);
-qd_alloc_stats_t alloc_stats_qdr_connection_work_t(void);
-qd_alloc_stats_t alloc_stats_qdr_core_timer_t(void);
-qd_alloc_stats_t alloc_stats_qdr_delivery_cleanup_t(void);
-qd_alloc_stats_t alloc_stats_qdr_delivery_ref_t(void);
-qd_alloc_stats_t alloc_stats_qdr_delivery_t(void);
-qd_alloc_stats_t alloc_stats_qdr_field_t(void);
-qd_alloc_stats_t alloc_stats_qdr_general_work_t(void);
-qd_alloc_stats_t alloc_stats_qdr_link_ref_t(void);
-qd_alloc_stats_t alloc_stats_qdr_link_t(void);
-qd_alloc_stats_t alloc_stats_qdr_link_work_t(void);
-qd_alloc_stats_t alloc_stats_qdr_query_t(void);
-qd_alloc_stats_t alloc_stats_qdr_terminus_t(void);
-
-static struct metric_definition metrics[] = {
+static uint64_t stats_get_connections(const qdr_global_stats_t *stats) { return stats->connections; }
+static uint64_t stats_get_links(const qdr_global_stats_t *stats) { return stats->links; }
+static uint64_t stats_get_addrs(const qdr_global_stats_t *stats) { return stats->addrs; }
+static uint64_t stats_get_routers(const qdr_global_stats_t *stats) { return stats->routers; }
+static uint64_t stats_get_auto_links(const qdr_global_stats_t *stats) { return stats->auto_links; }
+static uint64_t stats_get_presettled_deliveries(const qdr_global_stats_t *stats) { return stats->presettled_deliveries; }
+static uint64_t stats_get_dropped_presettled_deliveries(const qdr_global_stats_t *stats) { return stats->dropped_presettled_deliveries; }
+static uint64_t stats_get_accepted_deliveries(const qdr_global_stats_t *stats) { return stats->accepted_deliveries; }
+static uint64_t stats_get_released_deliveries(const qdr_global_stats_t *stats) { return stats->released_deliveries; }
+static uint64_t stats_get_rejected_deliveries(const qdr_global_stats_t *stats) { return stats->rejected_deliveries; }
+static uint64_t stats_get_modified_deliveries(const qdr_global_stats_t *stats) { return stats->modified_deliveries; }
+static uint64_t stats_get_deliveries_ingress(const qdr_global_stats_t *stats) { return stats->deliveries_ingress; }
+static uint64_t stats_get_deliveries_egress(const qdr_global_stats_t *stats) { return stats->deliveries_egress; }
+static uint64_t stats_get_deliveries_transit(const qdr_global_stats_t *stats) { return stats->deliveries_transit; }
+static uint64_t stats_get_deliveries_ingress_route_container(const qdr_global_stats_t *stats) { return stats->deliveries_ingress_route_container; }
+static uint64_t stats_get_deliveries_egress_route_container(const qdr_global_stats_t *stats) { return stats->deliveries_egress_route_container; }
+static uint64_t stats_get_deliveries_delayed_1sec(const qdr_global_stats_t *stats) { return stats->deliveries_delayed_1sec; }
+static uint64_t stats_get_deliveries_delayed_10sec(const qdr_global_stats_t *stats) { return stats->deliveries_delayed_10sec; }
+static uint64_t stats_get_deliveries_stuck(const qdr_global_stats_t *stats) { return stats->deliveries_stuck; }
+static uint64_t stats_get_links_blocked(const qdr_global_stats_t *stats) { return stats->links_blocked; }
+static uint64_t stats_get_deliveries_redirected_to_fallback(const qdr_global_stats_t *stats) { return stats->deliveries_redirected_to_fallback; }
+
+static const struct metric_definition metrics[] = {
     {"qdr_connections_total", "gauge", stats_get_connections},
     {"qdr_links_total", "gauge", stats_get_links},
     {"qdr_addresses_total", "gauge", stats_get_addrs},
@@ -569,102 +578,152 @@ static struct metric_definition metrics[] = {
     {"qdr_links_blocked_total", "gauge", stats_get_links_blocked},
     {"qdr_deliveries_redirected_to_fallback_total", "counter", stats_get_deliveries_redirected_to_fallback}
 };
-static size_t metrics_length = sizeof(metrics)/sizeof(metrics[0]);
-
-static struct allocator_metric_definition allocator_metrics[] = {
-        {"qdr_allocator_qd_bitmask_t", alloc_stats_qd_bitmask_t},
-        {"qdr_allocator_qd_buffer_t", alloc_stats_qd_buffer_t},
-        {"qdr_allocator_qd_composed_field_t", alloc_stats_qd_composed_field_t},
-        {"qdr_allocator_qd_composite_t", alloc_stats_qd_composite_t},
-        {"qdr_allocator_qd_connection_t", alloc_stats_qd_connection_t},
-        {"qdr_allocator_qd_hash_handle_t", alloc_stats_qd_hash_handle_t},
-        {"qdr_allocator_qd_hash_item_t", alloc_stats_qd_hash_item_t},
-        {"qdr_allocator_qd_iterator_t", alloc_stats_qd_iterator_t},
-        {"qdr_allocator_qd_link_ref_t", alloc_stats_qd_link_ref_t},
-        {"qdr_allocator_qd_link_t", alloc_stats_qd_link_t},
-        {"qdr_allocator_qd_listener_t", alloc_stats_qd_listener_t},
-        {"qdr_allocator_qd_log_entry_t", alloc_stats_qd_log_entry_t},
-        {"qdr_allocator_qd_management_context_t", alloc_stats_qd_management_context_t},
-        {"qdr_allocator_qd_message_content_t", alloc_stats_qd_message_content_t},
-        {"qdr_allocator_qd_message_t", alloc_stats_qd_message_t},
-        {"qdr_allocator_qd_node_t", alloc_stats_qd_node_t},
-        {"qdr_allocator_qd_parse_node_t", alloc_stats_qd_parse_node_t},
-        {"qdr_allocator_qd_parsed_field_t", alloc_stats_qd_parsed_field_t},
-        {"qdr_allocator_qd_timer_t", alloc_stats_qd_timer_t},
-        {"qdr_allocator_qdr_action_t", alloc_stats_qdr_action_t},
-        {"qdr_allocator_qdr_address_config_t", alloc_stats_qdr_address_config_t},
-        {"qdr_allocator_qdr_address_t", alloc_stats_qdr_address_t},
-        {"qdr_allocator_qdr_connection_info_t", alloc_stats_qdr_connection_info_t},
-        {"qdr_allocator_qdr_connection_t", alloc_stats_qdr_connection_t},
-        {"qdr_allocator_qdr_connection_work_t", alloc_stats_qdr_connection_work_t},
-        {"qdr_allocator_qdr_core_timer_t", alloc_stats_qdr_core_timer_t},
-        {"qdr_allocator_qdr_delivery_cleanup_t", alloc_stats_qdr_delivery_cleanup_t},
-        {"qdr_allocator_qdr_delivery_ref_t", alloc_stats_qdr_delivery_ref_t},
-        {"qdr_allocator_qdr_delivery_t", alloc_stats_qdr_delivery_t},
-        {"qdr_allocator_qdr_field_t", alloc_stats_qdr_field_t},
-        {"qdr_allocator_qdr_general_work_t", alloc_stats_qdr_general_work_t},
-        {"qdr_allocator_qdr_link_ref_t", alloc_stats_qdr_link_ref_t},
-        {"qdr_allocator_qdr_link_t", alloc_stats_qdr_link_t},
-        {"qdr_allocator_qdr_link_work_t", alloc_stats_qdr_link_work_t},
-        {"qdr_allocator_qdr_query_t", alloc_stats_qdr_query_t},
-        {"qdr_allocator_qdr_terminus_t", alloc_stats_qdr_terminus_t}
+static const size_t metrics_length = sizeof(metrics)/sizeof(metrics[0]);
+
+//
+// Metrics provided by the alloc_pool memory object cache.
+//
+// The alloc_pool module will register a name and descriptor for each memory object maintained by the pool during
+// initialization. This information can be used to gather the metrics associated with the given object. The alloc_pool
+// will deregister these on shutdown. See qd_http_add/remove_alloc_metric().
+//
+typedef struct allocator_metric_definition_t allocator_metric_definition_t;
+struct allocator_metric_definition_t {
+    DEQ_LINKS(allocator_metric_definition_t);
+    const char *name;
+    const qd_alloc_type_desc_t *desc;
 };
-static size_t allocator_metrics_length = sizeof(allocator_metrics)/sizeof(allocator_metrics[0]);
+DEQ_DECLARE(allocator_metric_definition_t, allocator_metric_definition_list_t);
+static allocator_metric_definition_list_t allocator_metrics = DEQ_EMPTY;
 
-#define ALLOC_DATA(S, F) ((allocator_field){#F, S.F})
+// Write a single metric to the output buffer. Advance (*start) past the written data (to the null terminator) and
+// return the total octets written (not including null terminator). Return zero on error (abort() if debug build).
+//
+static size_t _write_metric(uint8_t **start, size_t available, const char *name, const char *type, uint64_t value)
+{
+    // if you modify this please update any buffer sizing info above
 
-typedef struct allocator_field {
-    const char* name;
-    uint64_t value;
-} allocator_field;
+    int rc1 = snprintf((char *) *start, available, "# TYPE %s %s\n", name, type);
+    if (rc1 < 0 || rc1 >= available) { // overrun!
+        assert(false);  // you need to increase the output_buffer size!
+        return 0;
+    }
+    *start += rc1;
+    available -= rc1;
 
-static bool write_stats(uint8_t **position, const uint8_t * const end, const char* name, const char* type, int value)
+    int rc2 = snprintf((char *) *start, available, "%s %" PRIu64 "\n", name, value);
+    if (rc2 < 0 || rc2 >= available) { // overrun!
+        assert(false);  // you need to increase the output_buffer size!
+        return 0;
+    }
+    *start += rc2;
+
+    return rc1 + rc2;
+}
+
+// Write all the router global metrics to the output buffer. Return the total octets written (not including null
+// terminator) or zero on error.
+//
+// On successful return (*start) will be advanced to the terminating null byte.
+//
+static size_t _write_global_metrics(const stats_request_state_t *state, uint8_t **start, size_t available)
 {
-    //11 chars + type + 2*name + 20 chars for int
-    // average metric name size is 30 bytes
-    // average metric type size is 8 bytes
-    // current number of metrics is 22
-    // total metric buffer size = 22 * (11 + 8 + 2*30 + 20) = 2178
-    size_t length = 11 + strlen(type) + strlen(name)*2 + 20;
-    if (end - *position >= length) {
-        *position += lws_snprintf((char*) *position, end - *position, "# TYPE %s %s\n", name, type);
-        *position += lws_snprintf((char*) *position, end - *position, "%s %i\n", name, value);
-        return true;
-    } else {
-        return false;
+    assert(state && state->callback_completed);
+
+    const size_t save = available;
+
+    for (int index = 0; index < metrics_length; ++index) {
+        const metric_definition *metric = &metrics[index];
+        size_t rc = _write_metric(start, available, metric->name, metric->type, metric->get_value(&state->stats));
+        if (rc == 0) {
+            return 0;  // error writing, close the connection
+        }
+        available -= rc;
     }
+
+    return save - available;
 }
 
-static bool write_allocator_stats(uint8_t **position, const uint8_t * const end, const char* name, allocator_field field)
+
+// Write a single allocator metric to the output buffer. Generate the metric name using the name and subname. Return the
+// total octets written (not including null terminator) or zero on error.
+//
+// On successful return (*start) will be advanced to the terminating null byte.
+//
+static size_t _write_allocator_metric(uint8_t **start, size_t available, const char *name, const char *subname, uint64_t value)
 {
-    // 30 chars (static) + 2*name + 2*field.name + 20 for int
-    // average allocator metric name size is 54 bytes (name:field.name)
-    // current number of metrics is 180
-    // total allocator buffer size = 180 * (30 + 2*54 + 20) = 28440
-    size_t length = 30 + strlen(name)*2 + strlen(field.name)*2 + 20;
-    if (end - *position >= length) {
-        *position += lws_snprintf((char*) *position, end - *position, "# TYPE %s:%s_bytes gauge\n", name, field.name);
-        *position += lws_snprintf((char*) *position, end - *position, "%s:%s_bytes %"PRIu64"\n", name, field.name, field.value);
-        return true;
-    } else {
-        return false;
+    char name_buffer[MAX_METRIC_NAME_LEN + 1];
+    int rc = snprintf(name_buffer, sizeof(name_buffer), "%s:%s", name, subname);
+    if (rc < 0 || rc >= sizeof(name_buffer)) {  // overrun!
+        assert(false);  // you need to increase the output_buffer size!
+        return 0;
     }
+
+    return _write_metric(start, available, name_buffer, "gauge", value);
 }
 
-static bool write_metric(uint8_t **position, const uint8_t * const end, metric_definition* definition, qdr_global_stats_t* stats)
+// Write all the allocator metrics to the output buffer. Return the total octets written (not including null terminator)
+// or zero on error.
+//
+// On successful return (*start) will be advanced to the terminating null byte.
+//
+static size_t _write_allocator_metrics(uint8_t **start, size_t available)
 {
-    return write_stats(position, end, definition->name, definition->type, definition->value(stats));
+    const size_t save = available;
+    uint64_t pool_total_bytes = 0;  // total memory allocated across all types
+
+    allocator_metric_definition_t *metric = DEQ_HEAD(allocator_metrics);
+    assert(metric);  // an empty list is unexpected: alloc_pool registers its types during initialization
+
+    while (metric) {
+        qd_alloc_stats_t stats = qd_alloc_desc_stats(metric->desc);
+        uint64_t total_allocated = stats.total_alloc_from_heap - stats.total_free_to_heap;
+        uint64_t total_in_use = stats.held_by_threads;
+        uint64_t total_in_cache = total_allocated - total_in_use;
+        uint64_t total_bytes = total_allocated * qd_alloc_type_size(metric->desc);
+
+        pool_total_bytes += total_bytes;
+
+        size_t rc = _write_allocator_metric(start, available, metric->name, "total_allocated", total_allocated);
+        if (rc == 0) return 0;
+        available -= rc;
+
+        rc = _write_allocator_metric(start, available, metric->name, "total_in_use", total_in_use);
+        if (rc == 0) return 0;
+        available -= rc;
+
+        rc = _write_allocator_metric(start, available, metric->name, "total_in_cache", total_in_cache);
+        if (rc == 0) return 0;
+        available -= rc;
+
+        rc = _write_allocator_metric(start, available, metric->name, "total_bytes", total_bytes);
+        if (rc == 0) return 0;
+        available -= rc;
+
+        metric = DEQ_NEXT(metric);
+    }
+
+    size_t rc = _write_metric(start, available, "alloc_pool_total_bytes", "gauge", pool_total_bytes);
+    if (rc == 0) return 0;
+    available -= rc;
+
+    return save - available;
 }
 
-static bool write_allocator_metric(uint8_t **position, const uint8_t * const end, allocator_metric_definition* definition)
+// Gather the current metrics and write them to the output buffer. Return the total bytes written to the buffer (not
+// including null terminator) or zero on error.
+//
+// On successful return *start is advanced to the terminating null byte.
+//
+static size_t _generate_metrics_response(stats_request_state_t *state, uint8_t **start, const uint8_t * const end)
 {
-    qd_alloc_stats_t allocator_stats = definition->fn();
-    if (!write_allocator_stats(position, end, definition->name, ALLOC_DATA(allocator_stats, total_alloc_from_heap))) return false;
-    if (!write_allocator_stats(position, end, definition->name, ALLOC_DATA(allocator_stats, total_free_to_heap))) return false;
-    if (!write_allocator_stats(position, end, definition->name, ALLOC_DATA(allocator_stats, held_by_threads))) return false;
-    if (!write_allocator_stats(position, end, definition->name, ALLOC_DATA(allocator_stats, batches_rebalanced_to_threads))) return false;
-    if (!write_allocator_stats(position, end, definition->name, ALLOC_DATA(allocator_stats, batches_rebalanced_to_global))) return false;
-    return true;
+    const uint8_t *const begin = *start;  // remember where this response's output starts
+    if (_write_global_metrics(state, start, end - *start) == 0
+        || _write_allocator_metrics(start, end - *start) == 0) {
+        return 0;  // error, close the connection
+    }
+
+    return *start - begin;  // octets written, not the space left over (end - *start)
 }
 
 static int add_header_by_name(struct lws *wsi, const char* name, const char* value, uint8_t** position, uint8_t* end)
@@ -673,77 +732,106 @@ static int add_header_by_name(struct lws *wsi, const char* name, const char* val
 }
 
 static int callback_metrics(struct lws *wsi, enum lws_callback_reasons reason,
-                               void *user, void *in, size_t len)
+                            void *user, void *in, size_t len)
 {
     qd_http_server_t *hs = wsi_server(wsi);
     stats_t *stats = (stats_t*) user;
-    // rationale for buffer size is explained at write_stats and write_allocator_stats
-    uint8_t buffer[LWS_PRE + 30618];
-    uint8_t *start = &buffer[LWS_PRE], *position = start, *end = &buffer[sizeof(buffer) - LWS_PRE - 1];
+
+    if (!stats)   // ignore any non-http request events
+        return 0;
 
     switch (reason) {
 
     case LWS_CALLBACK_HTTP: {
-        stats->context = NEW(stats_request_state_t);
-        ZERO(stats->context);
-        stats->context->wsi = wsi;
-        stats->context->server = hs;
+        // New HTTP request received, setup per-request state with output buffer
+        assert(!stats->state);
+        // see the comments above regarding output buffer size for metrics:
+        size_t buf_size = HTTP_HEADER_LEN
+            // router global metrics:
+            + (metrics_length * PER_METRIC_BUF_SIZE)
+            // alloc_pool metrics (+ 1 for alloc_pool_total_bytes):
+            + (DEQ_SIZE(allocator_metrics) * PER_METRIC_BUF_SIZE * PER_ALLOC_METRIC_COUNT)
+            + PER_METRIC_BUF_SIZE
+            // 1 terminating null
+            + 1;
+        stats->state = new_stats_request_state(buf_size);
+        stats->state->wsi = wsi;
+        stats->state->server = hs;
         //request stats from core thread
-        qdr_request_global_stats(hs->core, &stats->context->stats, handle_stats_results, (void*) stats->context);
+        qdr_request_global_stats(hs->core, &stats->state->stats, handle_stats_results, (void*) stats->state);
         return 0;
     }
 
     case LWS_CALLBACK_HTTP_WRITEABLE: {
-        //encode stats into buffer
-        if (!stats->headers_sent) {
-            if (lws_add_http_header_status(wsi, HTTP_STATUS_OK, &position, end)
-                || add_header_by_name(wsi, "content-type:", "text/plain", &position, end)
-                || add_header_by_name(wsi, "connection:", "close", &position, end))
-                return 1;
-            if (lws_finalize_http_header(wsi, &position, end))
-                return 1;
-            stats->headers_sent = true;
+        // LWS HTTP server is ready to send the HTTP response data
+        assert(stats->state);  // expect LWS_CALLBACK_HTTP event occurs first!
+
+        if (stats->response_complete) {  // ignore spurious WRITABLE events once response complete
+            return 0;
         }
 
-        while (stats->current < metrics_length) {
-            if (write_metric(&position, end, &metrics[stats->current], &stats->context->stats)) {
-                stats->current++;
-                qd_log(LOG_HTTP, QD_LOG_DEBUG, "wrote metric %lu of %lu", stats->current, metrics_length);
-            } else {
-                qd_log(LOG_HTTP, QD_LOG_WARNING, "insufficient space in buffer");
-                break;
-            }
+        if (!stats->state->callback_completed) {
+            // the asynchronous request for global metrics has not yet completed. When it does another
+            // LWS_CALLBACK_HTTP_WRITEABLE event will be generated and then we can send the response.
+            return 0;
         }
 
-        int alloc_cur = 0;
-        while (alloc_cur < allocator_metrics_length) {
-            if (write_allocator_metric(&position, end, &allocator_metrics[alloc_cur])) {
-                qd_log(LOG_HTTP, QD_LOG_DEBUG, "wrote allocator metric %i of %lu", alloc_cur,
-                       allocator_metrics_length);
-                alloc_cur++;
-            } else {
-                qd_log(LOG_HTTP, QD_LOG_WARNING, "insufficient space in buffer");
-                break;
-            }
+        uint8_t *start = &stats->state->lws_prefix[LWS_PRE];
+        uint8_t *end = start + stats->state->buffer_size;  // first byte past buffer
+
+        // encode stats into buffer
+
+        if (lws_add_http_header_status(wsi, HTTP_STATUS_OK, &start, end)
+            || add_header_by_name(wsi, "content-type:", "text/plain", &start, end)
+            || add_header_by_name(wsi, "connection:", "close", &start, end)
+            || lws_finalize_http_header(wsi, &start, end)) {
+
+            qd_log(LOG_HTTP, QD_LOG_WARNING, "Metrics request failed: cannot send headers");
+            return 1;
+        }
+
+        // if this fails make HTTP_HEADER_LEN larger (LWS does not document the required size)
+        assert(HTTP_HEADER_LEN >= (start - &stats->state->lws_prefix[LWS_PRE]));
+
+        if (_generate_metrics_response(stats->state, &start, end) == 0) {
+            // Failed to generate output. This is not expected. Terminate the connection
+            qd_log(LOG_HTTP, QD_LOG_WARNING, "Metrics request failed: cannot access metrics");
+            return 1;
         }
-        int n = (stats->current < metrics_length) || (alloc_cur < allocator_metrics_length) ? LWS_WRITE_HTTP : LWS_WRITE_HTTP_FINAL;
 
-        //write buffer
-        size_t available = position - start;
-        if (lws_write(wsi, (unsigned char*) start, available, n) != available)
+        // Write the entire output buffer to LWS in one call. Best I can tell from the docs this should not fail
+        // unless the connection has closed.
+
+        size_t available = (size_t) (start - &stats->state->lws_prefix[LWS_PRE]);
+        int amount = lws_write(wsi, (unsigned char *) &stats->state->lws_prefix[LWS_PRE],
+                               available, LWS_WRITE_HTTP_FINAL);
+
+        if (amount < 0 || (size_t) amount < available) {
+            // lws_write returns -1 on fatal error (check the sign before the unsigned compare); more than available is ok
+            qd_log(LOG_HTTP, QD_LOG_WARNING, "Metrics request failed: connection closed while writing");
+            return 1;
+        }
+
+        stats->response_complete = true;
+
+        if (lws_http_transaction_completed(wsi)) {
+            // I do not think this is an error, but according to the examples we close the connection when this happens
             return 1;
-        if (n == LWS_WRITE_HTTP_FINAL) {
-            if (lws_http_transaction_completed(wsi)) return -1;
-        } else {
-            lws_callback_on_writable(wsi);
         }
         return 0;
     }
 
+    case LWS_CALLBACK_HTTP_DROP_PROTOCOL:
     case LWS_CALLBACK_CLOSED_HTTP: {
-        stats->context->wsi_deleted = true;
-        if (stats->context->callback_completed) {
-            free(stats->context);
+        // request complete (added DROP_PROTOCOL since we do not get CLOSED_HTTP from curl clients (?))
+        if (stats->state) {
+            stats->state->wsi_deleted = true;
+            // if the callback is still running then we cannot free the state since the callback will access it. We rely
+            // on the callback to free the state in this case. See handle_stats_result_HT().
+            if (stats->state->callback_completed) {
+                free_stats_request_state(stats->state);
+                stats->state = 0;
+            }
         }
         return 0;
     }
@@ -758,50 +846,83 @@ static int callback_healthz(struct lws *wsi, enum lws_callback_reasons reason,
 {
     qd_http_server_t *hs = wsi_server(wsi);
     stats_t *stats = (stats_t*) user;
-    uint8_t buffer[LWS_PRE + 2048];
-    uint8_t *start = &buffer[LWS_PRE], *position = start, *end = &buffer[sizeof(buffer) - LWS_PRE - 1];
+
+    if (!stats)   // ignore any non-http request events
+        return 0;
 
     switch (reason) {
 
     case LWS_CALLBACK_HTTP: {
-        stats->context = NEW(stats_request_state_t);
-        ZERO(stats->context);
-        stats->context->wsi = wsi;
-        stats->context->server = hs;
+        assert(!stats->state);
+        stats->state = new_stats_request_state(HEALTHZ_BUF_SIZE);
+        stats->state->wsi = wsi;
+        stats->state->server = hs;
         //make dummy request for stats (pass in null ptr); this still exercises the
         //path through core thread and back through callback on io thread which is
         //a reasonable initial liveness check
-        qdr_request_global_stats(hs->core, 0, handle_stats_results, (void*) stats->context);
+        qdr_request_global_stats(hs->core, 0, handle_stats_results, (void*) stats->state);
         return 0;
     }
 
     case LWS_CALLBACK_HTTP_WRITEABLE: {
-        //encode stats into buffer
-        if (!stats->headers_sent) {
-            if (lws_add_http_header_status(wsi, HTTP_STATUS_OK, &position, end)
-                || add_header_by_name(wsi, "content-type:", "text/plain", &position, end)
-                || lws_add_http_header_content_length(wsi, 3, &position, end))
-                return 1;
-            if (lws_finalize_http_header(wsi, &position, end))
-                return 1;
-            stats->headers_sent = true;
+        assert(stats->state);  // expect LWS_CALLBACK_HTTP event occurs first!
+
+        if (stats->response_complete) {  // ignore spurious WRITABLE events once response complete
+            return 0;
+        }
+
+        if (!stats->state->callback_completed) {
+            // the asynchronous request for global metrics has not yet completed. When it does another
+            // LWS_CALLBACK_HTTP_WRITEABLE event will be generated and then we can send the response.
+            return 0;
+        }
+
+        uint8_t *start = &stats->state->lws_prefix[LWS_PRE];
+        uint8_t *end = start + HEALTHZ_BUF_SIZE;  // first byte past buffer
+
+        // encode stats into buffer
+
+        if (lws_add_http_header_status(wsi, HTTP_STATUS_OK, &start, end)
+            || add_header_by_name(wsi, "content-type:", "text/plain", &start, end)
+            || lws_add_http_header_content_length(wsi, 3, &start, end)
+            || lws_finalize_http_header(wsi, &start, end)) {
+
+            qd_log(LOG_HTTP, QD_LOG_WARNING, "Healthz request failed: cannot send headers");
+            return 1;
         }
-        position += lws_snprintf((char*) position, end - position, "OK\n");
 
-        int n = LWS_WRITE_HTTP_FINAL;
-        //write buffer
-        size_t available = position - start;
-	if (lws_write(wsi, (unsigned char*) start, available, n) != available)
+        // if this fails make HTTP_HEADER_LEN larger (LWS does not document the required size)
+        assert(HTTP_HEADER_LEN >= (start - &stats->state->lws_prefix[LWS_PRE]));
+
+        start += lws_snprintf((char*) start, end - start, "OK\n");
+
+        size_t available = (size_t) (start - &stats->state->lws_prefix[LWS_PRE]);
+        int amount = lws_write(wsi, (unsigned char *) &stats->state->lws_prefix[LWS_PRE],
+                               available, LWS_WRITE_HTTP_FINAL);
+        if (amount < 0 || (size_t) amount < available) {
+            // lws_write returns -1 on fatal error (check the sign before the unsigned compare); more than available is ok
+            qd_log(LOG_HTTP, QD_LOG_WARNING, "Healthz request failed: connection closed while writing");
+            return 1;
+        }
+
+        stats->response_complete = true;
+
+        if (lws_http_transaction_completed(wsi)) {
+            // I do not think this is an error, but according to the examples we close the connection when this happens
             return 1;
-        else if (lws_http_transaction_completed(wsi))
-            return -1;
-        else return 0;
+        }
+
+        return 0;
     }
 
+    case LWS_CALLBACK_HTTP_DROP_PROTOCOL:  // won't get CLOSED_HTTP from curl (?)
     case LWS_CALLBACK_CLOSED_HTTP: {
-        stats->context->wsi_deleted = true;
-        if (stats->context->callback_completed) {
-            free(stats->context);
+        if (stats->state) {
+            stats->state->wsi_deleted = true;
+            if (stats->state->callback_completed) {
+                free_stats_request_state(stats->state);
+                stats->state = 0;
+            }
         }
         return 0;
     }
@@ -916,10 +1037,20 @@ static int callback_amqpws(struct lws *wsi, enum lws_callback_reasons reason,
 
 #define DEFAULT_TICK 1000
 
-static void* http_thread_run(void* v) {
+#ifndef NDEBUG
+static int threads_running;
+#endif
+
+static void* http_thread_run(void* v)
+{
     qd_http_server_t *hs = v;
     qd_log(LOG_HTTP, QD_LOG_INFO, "HTTP server thread running");
     int result = 0;
+
+#ifndef NDEBUG
+    ++threads_running;
+#endif
+
     while(result >= 0) {
         /* Send a USER event to run transport ticks, may decrease hs->next_tick. */
         hs->now = qd_timer_now();
@@ -956,6 +1087,11 @@ static void* http_thread_run(void* v) {
             }
         }
     }
+
+#ifndef NDEBUG
+    --threads_running;
+#endif
+
     qd_log(LOG_HTTP, QD_LOG_INFO, "HTTP server thread exit");
     return NULL;
 }
@@ -1026,6 +1162,7 @@ qd_lws_listener_t *qd_http_server_listen(qd_http_server_t *hs, qd_listener_t *li
         work_t w = { W_LISTEN, hl };
         work_push(hs, w);
     }
+
     return hl;
 }
 
@@ -1048,3 +1185,53 @@ static qd_lws_listener_t *wsi_listener(struct lws *wsi) {
     }
     return hl;
 }
+
+void qd_http_add_alloc_metric(const char *name, const qd_alloc_type_desc_t *desc)
+{
+    allocator_metric_definition_t *md = qd_malloc(sizeof(allocator_metric_definition_t));
+    ZERO(md);
+    DEQ_ITEM_INIT(md);
+    // name and desc are stored by pointer (not copied): they must stay valid until qd_http_remove_alloc_metric()
+    md->name = name;
+    md->desc = desc;
+    DEQ_INSERT_TAIL(allocator_metrics, md);
+
+#ifndef NDEBUG
+    // Attempting to add a metric after the server threads have started will crash stuff. If you hit this assert then
+    // qd_alloc_initialize() has not been called. qd_alloc_initialize() MUST be called before starting the http threads!
+    assert(threads_running == 0);
+#endif
+}
+
+void qd_http_remove_alloc_metric(const char *name)
+{
+    allocator_metric_definition_t *md = DEQ_HEAD(allocator_metrics);
+    DEQ_FIND(md, strcmp(md->name, name) == 0);  // presumably leaves md at the first entry named 'name', or NULL if none
+    if (md) {
+        DEQ_REMOVE(allocator_metrics, md);
+        free(md);  // releases only the list node; md->name and md->desc are not freed here
+    }
+
+#ifndef NDEBUG
+    // Attempting to remove a metric while the server threads are running will crash stuff. If you hit this assert then
+    // qd_alloc_finalize() has been called prior to stopping all http threads. qd_alloc_finalize() MUST NOT be called
+    // while http threads are running!
+    assert(threads_running == 0);
+#endif
+}
+
+// allocate a new stats_request_state_t instance, include buffer_size additional octets past the structure for rendering
+// the HTTP response
+// (release the returned instance with free_stats_request_state())
+static stats_request_state_t *new_stats_request_state(size_t buffer_size)
+{
+    stats_request_state_t *state = qd_malloc(sizeof(stats_request_state_t) + buffer_size);
+    ZERO(state);  // do not bother initializing buffer space - it will be overwritten
+    state->buffer_size = buffer_size;
+    return state;
+}
+
+static void free_stats_request_state(stats_request_state_t *state)
+{
+    free(state);  // new_stats_request_state() allocates the struct and its output buffer as one block
+}
diff --git a/src/http.h b/src/http.h
index fffa2f671..771396345 100644
--- a/src/http.h
+++ b/src/http.h
@@ -45,4 +45,9 @@ qd_lws_listener_t *qd_http_server_listen(qd_http_server_t *s, struct qd_listener
  */
 void qd_lws_listener_close(qd_lws_listener_t *hl);
 
+/* register/deregister alloc_pool metrics (NOT thread safe: call only while no HTTP server threads are running) */
+struct qd_alloc_type_desc_t;
+void qd_http_add_alloc_metric(const char *name, const struct qd_alloc_type_desc_t *desc);
+void qd_http_remove_alloc_metric(const char *name);
+
 #endif // QD_HTTP_H
diff --git a/tests/system_tests_http.py b/tests/system_tests_http.py
index 4f3afac2d..38aad2e99 100644
--- a/tests/system_tests_http.py
+++ b/tests/system_tests_http.py
@@ -20,16 +20,20 @@
 import os
 import threading
 import ssl
-from subprocess import PIPE, STDOUT
 
 from urllib.request import urlopen, build_opener, HTTPSHandler
 from urllib.error import HTTPError, URLError
 
 import skupper_router_site
-from system_test import TIMEOUT, Process, QdManager, retry
+from system_test import Process, QdManager, retry
 from system_test import TestCase, Qdrouterd, main_module, DIR
 from system_test import unittest
 
+#
+# Note: these tests exercise the management interface accessed via HTTP. These
+# tests have nothing to do with the HTTP adaptors!
+#
+
 
 class RouterTestHttp(TestCase):
 
@@ -61,18 +65,6 @@ def get_cert(cls, url):
         opener = build_opener(HTTPSHandler(context=context))
         return opener.open(url).read().decode('utf-8')
 
-    def run_skmanage(self, cmd, input=None, expect=Process.EXIT_OK, address=None):
-        p = self.popen(
-            ['skmanage'] + cmd.split(' ') + ['--bus', address or self.address(), '--indent=-1', '--timeout', str(TIMEOUT)],
-            stdin=PIPE, stdout=PIPE, stderr=STDOUT, expect=expect,
-            universal_newlines=True)
-        out = p.communicate(input)[0]
-        try:
-            p.teardown()
-        except Exception as e:
-            raise Exception(out if out else str(e))
-        return out
-
     def assert_get(self, url):
         self.assertEqual('HTTP test\n', self.get("%s/system_tests_http.txt" % url))
 
@@ -202,26 +194,62 @@ def run(self):
         self.assertRaises(URLError, urlopen, "https://localhost:%d/nosuch" % r.ports[0])
 
     def test_http_metrics(self):
+        """ Verify the prometheus metrics provided by the router """
+        metrics_ports = [self.get_port(), self.get_port()]
         config = Qdrouterd.Config([
             ('router', {'id': 'QDR.METRICS'}),
-            ('listener', {'port': self.get_port(), 'http': 'yes'}),
-            ('listener', {'port': self.get_port(), 'httpRootDir': os.path.dirname(__file__)}),
+            ('listener', {'role': 'normal', 'port': self.get_port()}),
+            ('listener', {'port': metrics_ports[0], 'http': 'yes'}),
+            ('listener', {'port': metrics_ports[1], 'httpRootDir': os.path.dirname(__file__)}),
         ])
         r = self.qdrouterd('metrics-test-router', config)
 
-        def test(port):
-            result = urlopen("http://localhost:%d/metrics" % port, cafile=self.ssl_file('ca-certificate.pem'))
-            self.assertEqual(200, result.getcode())
-            data = result.read().decode('utf-8')
-            assert 'connections' in data
-            assert 'deliveries_ingress' in data
-            assert 'deliveries_delayed_1sec' in data
-            assert 'deliveries_delayed_10sec' in data
-            assert 'deliveries_redirected_to_fallback' in data
+        # generate a list of all metric names expected to be provided via HTTP:
+
+        stat_names = ["qdr_connections_total", "qdr_links_total",
+                      "qdr_addresses_total", "qdr_routers_total",
+                      "qdr_auto_links_total",
+                      "qdr_presettled_deliveries_total",
+                      "qdr_dropped_presettled_deliveries_total",
+                      "qdr_accepted_deliveries_total",
+                      "qdr_released_deliveries_total",
+                      "qdr_rejected_deliveries_total",
+                      "qdr_modified_deliveries_total",
+                      "qdr_deliveries_ingress_total",
+                      "qdr_deliveries_egress_total",
+                      "qdr_deliveries_transit_total",
+                      "qdr_deliveries_ingress_route_container_total",
+                      "qdr_deliveries_egress_route_container_total",
+                      "qdr_deliveries_delayed_1sec_total",
+                      "qdr_deliveries_delayed_10sec_total",
+                      "qdr_deliveries_stuck_total",
+                      "qdr_links_blocked_total",
+                      "qdr_deliveries_redirected_to_fallback_total"]
+        for stat in r.management.query(type="io.skupper.router.allocator").get_dicts():
+            stat_names.append(stat['typeName'])
+
+        def _test(stat_names, port):
+            # sanity check that all expected stats are reported
+            resp = urlopen(f"http://localhost:{port}/metrics", cafile=self.ssl_file('ca-certificate.pem'))
+            self.assertEqual(200, resp.getcode())
+            metrics = [x for x in resp.read().decode('utf-8').splitlines() if x.strip() and not x.startswith("#")]
+
+            # Verify that all expected stats are reported by the metrics URL
+
+            for name in stat_names:
+                found = False
+                for metric in metrics:
+                    # remove the counter and strip the allocator name suffix
+                    # (if present); blank lines were filtered out above
+                    mname = metric.strip().split()[0].split(':')[0]
+                    if mname == name:
+                        found = True
+                        break
+                self.assertTrue(found, f"Did not find {name} in returned metrics!")
 
         # Sequential calls on multiple ports
-        for port in r.ports:
-            test(port)
+        for port in metrics_ports:
+            _test(stat_names, port)
 
         # Concurrent calls on multiple ports
         class TestThread(threading.Thread):
@@ -232,10 +260,11 @@ def __init__(self, port):
 
             def run(self):
                 try:
-                    test(self.port)
+                    _test(stat_names, self.port)
                 except Exception as e:
                     self.ex = e
-        threads = [TestThread(p) for p in r.ports + r.ports]
+
+        threads = [TestThread(p) for p in metrics_ports * 4]
         for t in threads:
             t.join()
         for t in threads: