Aiven-Open · libretto · Jun 9, 2023 · Jun 9, 2023 · Jun 10, 2023 · Jun 10, 2023
@@ -461,6 +461,15 @@ Keys to take special care are the ones needed to configure Kafka and advertised_
    * - ``master_election_strategy``
      - ``lowest``
      - Decides on what basis the Karapace cluster master is chosen (only relevant in a multi node setup)
+   * - ``metrics_mode``
+     - ``statsd``
+     - Statistics server mode. Karapace currently supports a statsd server
+   * - ``statsd_uri``
+     - ``127.0.0.1:8125``
+     - Host:Port of statsd server
+   * - ``metrics_extended``
+     - ``true``
+     - Enable extended metrics. Extended metrics: connections_active, request_size_avg, request_size_max, response_size_avg, response_size_max
 
 
 Authentication and authorization of Karapace Schema Registry REST API

@@ -27,5 +27,8 @@
     "registry_authfile": null,
     "topic_name": "_schemas",
     "protobuf_runtime_directory": "runtime",
-    "session_timeout_ms": 10000
+    "session_timeout_ms": 10000,
+    "metrics_mode": "statsd",
+    "statsd_uri": "127.0.0.1:8125",
+    "metrics_extended": true
 }
@@ -143,6 +143,9 @@ class ConfigDefaults(Config, total=False):
     "karapace_registry": False,
     "master_election_strategy": "lowest",
     "protobuf_runtime_directory": "runtime",
+    "metrics_mode": "statsd",
+    "statsd_uri": "127.0.0.1:8125",
+    "metrics_extended": True,
 }
 SECRET_CONFIG_OPTIONS = [SASL_PLAIN_PASSWORD]
 

diff --git a/karapace/karapacemetrics.py b/karapace/karapacemetrics.py
@@ -0,0 +1,133 @@
+"""
+karapace - metrics
+Supports collection of system metrics
+list of supported metrics:
+connections-active - The number of active HTTP(S) connections to server.
+                     Data collected inside aiohttp request handler.
+
+Copyright (c) 2023 Aiven Ltd
+See LICENSE for details
+"""
+from __future__ import annotations
+
+from kafka.metrics import Metrics
+from karapace.config import Config
+from karapace.statsd import StatsClient
+
+import os
+import psutil
+import schedule
+import threading
+import time
+
+
+class Singleton(type):
+    """Metaclass that gives every class using it a single shared instance.
+
+    The first call to such a class constructs the instance; every later call
+    returns that same object, and any arguments passed to later calls are
+    ignored.
+    """
+
+    _instance: Singleton | None = None
+
+    def __call__(cls, *args: str, **kwargs: int) -> Singleton:
+        # Consult only the class's own namespace. A plain ``cls._instance``
+        # lookup also finds an instance cached on a base class, which would
+        # hand a subclass its parent's object instead of one of its own.
+        if cls.__dict__.get("_instance") is None:
+            cls._instance = super().__call__(*args, **kwargs)
+        return cls._instance
+
+
+class KarapaceMetrics(metaclass=Singleton):
+    """Process-wide collector that forwards extended metrics to a StatsClient.
+
+    Because of the ``Singleton`` metaclass, call sites obtain the collector by
+    calling ``KarapaceMetrics()``. Every reporting method is a no-op until
+    ``setup()`` has been called with a config whose ``metrics_extended`` value
+    is truthy.
+    """
+
+    def __init__(self) -> None:
+        # Truthy once setup() has seen ``metrics_extended`` enabled.
+        self.active: object | None = None
+        self.stats_client: StatsClient | None = None
+        # Set by the first setup() call that gets past the enabled check.
+        self.is_ready = False
+        self.metrics = Metrics()
+        self.stop_event = threading.Event()
+        self.worker_thread = threading.Thread(target=self.worker)
+        self.lock = threading.Lock()
+        self.error_count = 0
+        self.app_host = ""
+        self.app_port = 8081
+
+    def setup(self, stats_client: StatsClient, config: Config) -> None:
+        """Enable collection and start the periodic connection counter.
+
+        Does nothing when ``metrics_extended`` is falsy in ``config`` or when
+        an earlier call already initialised the collector.
+
+        Raises:
+            RuntimeError: if ``config`` has no truthy ``host`` or ``port``.
+        """
+        self.active = config.get("metrics_extended")
+        if not self.active:
+            return
+        with self.lock:
+            # Only the first caller to reach this point continues.
+            if self.is_ready:
+                return
+            self.is_ready = True
+        if self.stats_client:
+            # A client was installed by other means; do not take over.
+            self.active = False
+            return
+        self.stats_client = stats_client
+        app_host = config.get("host")
+        app_port = config.get("port")
+        if not (app_host and app_port):
+            raise RuntimeError("No application host or port defined in application")
+        self.app_host = app_host
+        self.app_port = app_port
+
+        schedule.every(10).seconds.do(self.connections)
+        self.worker_thread.start()
+
+    def _client(self) -> StatsClient | None:
+        """Return the stats client while collection is active, else ``None``.
+
+        Raises:
+            RuntimeError: if collection is active but no StatsClient is set.
+        """
+        if not self.active:
+            return None
+        if not isinstance(self.stats_client, StatsClient):
+            raise RuntimeError("no StatsClient available")
+        return self.stats_client
+
+    def request(self, size: int) -> None:
+        """Report ``size`` as the ``request-size`` gauge."""
+        client = self._client()
+        if client is not None:
+            client.gauge("request-size", size)
+
+    def response(self, size: int) -> None:
+        """Report ``size`` as the ``response-size`` gauge."""
+        client = self._client()
+        if client is not None:
+            client.gauge("response-size", size)
+
+    def are_we_master(self, is_master: bool) -> None:
+        """Report the ``master-slave-role`` gauge: 1 for master, 0 otherwise."""
+        client = self._client()
+        if client is not None:
+            client.gauge("master-slave-role", int(is_master))
+
+    def latency(self, latency_ms: float) -> None:
+        """Report ``latency_ms`` as the ``latency_ms`` gauge."""
+        client = self._client()
+        if client is not None:
+            # Own metric name: this used to be published as
+            # "master-slave-role", overwriting the role gauge.
+            client.gauge("latency_ms", latency_ms)
+
+    def error(self) -> None:
+        """Increment the running error count and report it as ``error``."""
+        client = self._client()
+        if client is None:
+            return
+        self.error_count += 1
+        client.gauge("error", self.error_count)
+
+    def connections(self) -> None:
+        """Report the ``connections-active`` gauge.
+
+        Counts TCP connections in ``ESTABLISHED`` state whose local address
+        equals the configured application host and port.
+        """
+        client = self._client()
+        if client is None:
+            return
+        established = sum(
+            1
+            for conn in psutil.net_connections(kind="tcp")
+            if conn.laddr
+            and conn.laddr[0] == self.app_host
+            and conn.laddr[1] == self.app_port
+            and conn.status == "ESTABLISHED"
+        )
+        client.gauge("connections-active", established)
+
+    def worker(self) -> None:
+        """Run pending ``schedule`` jobs about once a second until stopped."""
+        while not self.stop_event.is_set():
+            schedule.run_pending()
+            # Wait on the event rather than sleeping so that cleanup() wakes
+            # the thread immediately instead of waiting out the interval.
+            self.stop_event.wait(1)
+
+    def cleanup(self) -> None:
+        """Stop the worker thread and wait for it to exit; no-op if inactive."""
+        if not self.active:
+            return
+        self.stop_event.set()
+        if self.worker_thread.is_alive():
+            self.worker_thread.join()
@@ -9,6 +9,7 @@
 from accept_types import get_best_match
 from http import HTTPStatus
 from karapace.config import Config, create_server_ssl_context
+from karapace.karapacemetrics import KarapaceMetrics
 from karapace.statsd import StatsClient
 from karapace.utils import json_decode, json_encode
 from karapace.version import __version__
@@ -134,6 +135,8 @@ def __init__(
         if content_type:
             self.headers["Content-Type"] = content_type
         super().__init__(f"HTTPResponse {status.value}")
+        if not is_success(status):
+            KarapaceMetrics().error()
 
     def ok(self) -> bool:
         """True if resposne has a 2xx status_code"""
@@ -169,6 +172,7 @@ def __init__(
         self.stats = StatsClient(config=config)
         self.app.on_cleanup.append(self.close_by_app)
         self.not_ready_handler = not_ready_handler
+        KarapaceMetrics().setup(self.stats, config)
 
     def _create_aiohttp_application(self, *, config: Config) -> aiohttp.web.Application:
         return aiohttp.web.Application(client_max_size=config["http_request_max_size"])
@@ -183,6 +187,7 @@ async def close(self) -> None:
         set as hook because the awaitables have to run inside the event loop
         created by the aiohttp library.
         """
+        KarapaceMetrics().cleanup()
         self.stats.close()
 
     @staticmethod
@@ -269,15 +274,22 @@ async def _handle_request(
             url=request.url,
             path_for_stats=path_for_stats,
         )
+
         try:
             if request.method == "OPTIONS":
+                # self.metrics.request(0)
                 origin = request.headers.get("Origin")
                 if not origin:
                     raise HTTPResponse(body="OPTIONS missing Origin", status=HTTPStatus.BAD_REQUEST)
                 headers = self.cors_and_server_headers_for_request(request=rapu_request, origin=origin)
+
                 raise HTTPResponse(body=b"", status=HTTPStatus.OK, headers=headers)
 
             body = await request.read()
+            if body:
+                KarapaceMetrics().request(len(body))
+            else:
+                KarapaceMetrics().request(0)
             if json_request:
                 if not body:
                     raise HTTPResponse(body="Missing request JSON body", status=HTTPStatus.BAD_REQUEST)
@@ -385,6 +397,7 @@ async def _handle_request(
             )
             headers = {"Content-Type": "application/json"}
             resp = aiohttp.web.Response(body=body, status=status.value, headers=headers)
+
         except asyncio.CancelledError:
             self.log.debug("Client closed connection")
             raise
@@ -393,6 +406,8 @@ async def _handle_request(
             self.log.exception("Unexpected error handling user request: %s %s", request.method, request.url)
             resp = aiohttp.web.Response(text="Internal Server Error", status=HTTPStatus.INTERNAL_SERVER_ERROR.value)
         finally:
+            KarapaceMetrics().response(resp.content_length)
+            KarapaceMetrics().latency((time.monotonic() - start_time) * 1000)
             self.stats.timing(
                 self.app_request_metric,
                 time.monotonic() - start_time,

@@ -22,6 +22,7 @@
     VersionNotFoundException,
 )
 from karapace.in_memory_database import InMemoryDatabase
+from karapace.karapacemetrics import KarapaceMetrics
 from karapace.key_format import KeyFormatter
 from karapace.master_coordinator import MasterCoordinator
 from karapace.messaging import KarapaceProducer
@@ -123,6 +124,7 @@ async def get_master(self, ignore_readiness: bool = False) -> tuple[bool, str |
                 elif not ignore_readiness and self.schema_reader.ready is False:
                     LOG.info("Schema reader isn't ready yet: %r", self.schema_reader.ready)
                 else:
+                    KarapaceMetrics().are_we_master(are_we_master)
                     return are_we_master, master_url
                 await asyncio.sleep(1.0)
 

@@ -19,6 +19,7 @@
 import logging
 import socket
 import time
+import urllib
 
 STATSD_HOST: Final = "127.0.0.1"
 STATSD_PORT: Final = 8125
@@ -32,7 +33,19 @@ def __init__(
         host: str = STATSD_HOST,
         port: int = STATSD_PORT,
     ) -> None:
-        self._dest_addr: Final = (host, port)
+        _host = host
+        _port = port
+
+        if config.get("metrics_mode") == "statsd":
+            statsd_uri = config.get("statsd_uri")
+            if statsd_uri:
+                srv = urllib.parse.urlsplit("//" + str(statsd_uri))
+                if srv.hostname:
+                    _host = str(srv.hostname)
+                if srv.port:
+                    _port = int(srv.port)
+
+        self._dest_addr: Final = (_host, _port)
         self._socket: Final = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
         self._tags: Final = config.get("tags", {})
         self.sentry_client: Final = get_sentry_client(sentry_config=config.get("sentry", None))

@@ -53,9 +53,9 @@ commonmark==0.9.1
     # via
     #   -r requirements.txt
     #   rich
-configargparse==1.5.3
+configargparse==1.5.5
     # via locust
-exceptiongroup==1.1.1
+exceptiongroup==1.1.2
     # via
     #   -r requirements.txt
     #   anyio
@@ -89,7 +89,7 @@ geventhttpclient==2.0.9
     # via locust
 greenlet==2.0.2
     # via gevent
-hypothesis==6.79.3
+hypothesis==6.80.0
     # via -r requirements-dev.in
 idna==3.4
     # via
@@ -150,6 +150,7 @@ protobuf==3.20.3
 psutil==5.9.5
     # via
     #   -r requirements-dev.in
+    #   -r requirements.txt
     #   locust
     #   pytest-xdist
 pygments==2.15.1
@@ -184,6 +185,8 @@ rich==12.5.1
     # via -r requirements.txt
 roundrobin==0.0.4
     # via locust
+schedule==1.2.0
+    # via -r requirements.txt
 sentry-sdk==1.26.0
     # via -r requirements-dev.in
 six==1.16.0
@@ -202,7 +205,7 @@ tenacity==8.2.2
     # via -r requirements.txt
 tomli==2.0.1
     # via pytest
-typing-extensions==4.6.3
+typing-extensions==4.7.1
     # via
     #   -r requirements.txt
     #   locust

@@ -22,7 +22,7 @@ types-cachetools==5.3.0.5
     # via -r requirements-typing.in
 types-jsonschema==4.17.0.8
     # via -r requirements-typing.in
-typing-extensions==4.6.3
+typing-extensions==4.7.1
     # via
     #   -c requirements-dev.txt
     #   mypy

@@ -11,10 +11,13 @@ tenacity<9
 typing-extensions
 ujson<6
 watchfiles<1
+schedule
+psutil
 xxhash~=3.0
 rich~=12.5.0
 cachetools==5.2.0
 
+
 # Patched dependencies
 #
 # Note: It is important to use commits to reference patched dependencies. This

@@ -30,7 +30,7 @@ charset-normalizer==3.1.0
     # via aiohttp
 commonmark==0.9.1
     # via rich
-exceptiongroup==1.1.1
+exceptiongroup==1.1.2
     # via anyio
 frozenlist==1.3.3
     # via
@@ -62,6 +62,8 @@ pkgutil-resolve-name==1.3.10
     # via jsonschema
 protobuf==3.20.3
     # via -r requirements.in
+psutil==5.9.5
+    # via -r requirements.in
 pygments==2.15.1
     # via rich
 pyrsistent==0.19.3
@@ -70,6 +72,8 @@ python-dateutil==2.8.2
     # via -r requirements.in
 rich==12.5.1
     # via -r requirements.in
+schedule==1.2.0
+    # via -r requirements.in
 six==1.16.0
     # via
     #   isodate
@@ -78,7 +82,7 @@ sniffio==1.3.0
     # via anyio
 tenacity==8.2.2
     # via -r requirements.in
-typing-extensions==4.6.3
+typing-extensions==4.7.1
     # via
     #   -r requirements.in
     #   rich