From 01c98a4e9812a3469c072d6c26f8bff8f258550d Mon Sep 17 00:00:00 2001 From: nikitagrishin Date: Wed, 29 Mar 2023 17:29:27 +0300 Subject: [PATCH] #200 --- docker-compose-webtest.yml | 147 ++++++++++++++-- jaeger-ui.json | 8 + otel-collector-config.yaml | 56 ++++-- postgres-exporter-queries.yaml | 311 +++++++++++++++++++++++++++++++++ postgres-exporter.yaml | 8 + postgres-init.sql | 2 + prometheus-config.yaml | 12 ++ rabbitmq-enabled-plugins | 1 + 8 files changed, 515 insertions(+), 30 deletions(-) create mode 100644 jaeger-ui.json create mode 100644 postgres-exporter-queries.yaml create mode 100644 postgres-exporter.yaml create mode 100644 postgres-init.sql create mode 100644 prometheus-config.yaml create mode 100644 rabbitmq-enabled-plugins diff --git a/docker-compose-webtest.yml b/docker-compose-webtest.yml index 9204aa61..28ce18bc 100644 --- a/docker-compose-webtest.yml +++ b/docker-compose-webtest.yml @@ -5,43 +5,156 @@ services: postgres: image: postgres:15.1-alpine container_name: postgres - restart: unless-stopped + command: postgres -c shared_preload_libraries=pg_stat_statements -c pg_stat_statements.track=all -c max_connections=200 + volumes: + - ./postgres-init.sql:/docker-entrypoint-initdb.d/postgres-init.sql environment: - POSTGRES_PASSWORD=Password12! 
ports: - '5432:5432' + healthcheck: + test: [ "CMD-SHELL", "pg_isready", "-d", "postgres", "-U", "postgres" ] + interval: 5s + timeout: 5s + retries: 5 + restart: unless-stopped + deploy: + resources: + limits: + memory: 512M + + # Postgres exporter + postgres-exporter: + image: quay.io/prometheuscommunity/postgres-exporter:v0.12.0 + container_name: postgres-exporter + command: + - --config.file=/etc/postgres-exporter/postgres-exporter.yaml + - --extend.query-path=/etc/postgres-exporter/postgres-exporter-queries.yaml + volumes: + - ./postgres-exporter.yaml:/etc/postgres-exporter/postgres-exporter.yaml + - ./postgres-exporter-queries.yaml:/etc/postgres-exporter/postgres-exporter-queries.yaml + environment: + - DATA_SOURCE_URI=postgres:5432?sslmode=disable + - DATA_SOURCE_USER=postgres + - DATA_SOURCE_PASS=Password12! + ports: + - '9187' + depends_on: + postgres: + condition: service_healthy + restart: unless-stopped + deploy: + resources: + limits: + memory: 512M # RabbitMQ rabbit: - image: rabbitmq:3.11.11-management-alpine + image: rabbitmq:3.11.11-alpine container_name: rabbit - restart: unless-stopped + volumes: + - ./rabbitmq-enabled-plugins:/etc/rabbitmq/enabled_plugins ports: - '15672:15672' - '5672:5672' - - # Jaeger - jaeger: - image: jaegertracing/all-in-one:1.43.0 - container_name: jaeger + healthcheck: + test: rabbitmq-diagnostics check_port_connectivity + interval: 5s + timeout: 5s + retries: 5 restart: unless-stopped + deploy: + resources: + limits: + memory: 512M + + # Prometheus + prometheus: + image: quay.io/prometheus/prometheus:v2.43.0 + container_name: prometheus + command: + - --web.console.templates=/etc/prometheus/consoles + - --web.console.libraries=/etc/prometheus/console_libraries + - --storage.tsdb.retention.time=1h + - --config.file=/etc/prometheus/prometheus-config.yaml + - --storage.tsdb.path=/prometheus + - --web.enable-lifecycle + - --web.route-prefix=/ + - --enable-feature=exemplar-storage + - --enable-feature=remote-write-receiver + 
volumes: + - ./prometheus-config.yaml:/etc/prometheus/prometheus-config.yaml ports: - - '5778:5778' # serve configs (sampling, etc.) - - '16686:16686' # serve frontend - - '14250:14250' # accept model.proto + - "9090:9090" + restart: unless-stopped + deploy: + resources: + limits: + memory: 512M # OpenTelemetry collector otel-collector: - image: otel/opentelemetry-collector:0.74.0 + image: otel/opentelemetry-collector-contrib:0.74.0 container_name: otel-collector - command: [ --config=/etc/otel-collector-config.yaml ] + command: + - --config=/etc/otel-collector-config.yaml + - --feature-gates=service.connectors volumes: - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml ports: - - '13133:13133' # health_check extension - '4317:4317' # OTLP gRPC receiver - '4318:4318' # OTLP http receiver + - "9464" # Prometheus exporter + - "8888" # metrics endpoint + restart: unless-stopped + deploy: + resources: + limits: + memory: 512M + + # Jaeger + jaeger: + image: jaegertracing/all-in-one:1.43.0 + container_name: jaeger + command: + - "--memory.max-traces" + - "1000" + - "--query.base-path" + - "/jaeger/ui" + - "--query.ui-config" + - "/etc/jaeger/jaeger-ui.json" + volumes: + - "./jaeger-ui.json:/etc/jaeger/jaeger-ui.json" + environment: + - COLLECTOR_OTLP_ENABLED=true + - METRICS_STORAGE_TYPE=prometheus + - PROMETHEUS_SERVER_URL=http://prometheus:9090 + - PROMETHEUS_TLS_Enabled=false + ports: + - '4317' # accept OpenTelemetry Protocol (OTLP) over gRPC + - '4318' # accept OpenTelemetry Protocol (OTLP) over HTTP + - '14268' # accept jaeger.thrift directly from clients over HTTP + - '14269' # admin HTTP server and metrics + - '16686:16686' # UI + restart: unless-stopped depends_on: - - postgres - - rabbit - - jaeger \ No newline at end of file + - prometheus + deploy: + resources: + limits: + memory: 512M + + ## Test web-app + #web-app: + # depends_on: + # - postgres + # - rabbit + # - jaeger + # build: + # dockerfile: Dockerfile.webtest + # context: . 
+ # restart: unless-stopped + # environment: + # - ASPNETCORE_ENVIRONMENT=Development + # ports: + # - '5000:80' \ No newline at end of file diff --git a/jaeger-ui.json b/jaeger-ui.json new file mode 100644 index 00000000..b2baccd0 --- /dev/null +++ b/jaeger-ui.json @@ -0,0 +1,8 @@ +{ + "monitor": { + "menuEnabled": true + }, + "dependencies": { + "menuEnabled": true + } +} \ No newline at end of file diff --git a/otel-collector-config.yaml b/otel-collector-config.yaml index 7a34ae83..cc07dce5 100644 --- a/otel-collector-config.yaml +++ b/otel-collector-config.yaml @@ -1,7 +1,3 @@ -extensions: - memory_ballast: - size_mib: 512 - receivers: otlp: protocols: @@ -10,21 +6,55 @@ receivers: processors: batch: - memory_limiter: - limit_mib: 1024 - spike_limit_mib: 512 - check_interval: 5s + metricstransform/insert: + transforms: + - include: calls + match_type: strict + action: insert + new_name: calls_total + operations: + - action: update_label + label: span.name + new_label: operation + - include: duration + match_type: strict + action: insert + new_name: latency + operations: + - action: update_label + label: span.name + new_label: operation exporters: - jaeger: - endpoint: "jaeger:14250" + otlp: + endpoint: "jaeger:4317" tls: insecure: true + prometheus: + endpoint: "otel-collector:9464" + resource_to_telemetry_conversion: + enabled: true + enable_open_metrics: true + +connectors: + spanmetrics: + histogram: + explicit: + buckets: [100us, 1ms, 2ms, 6ms, 10ms, 100ms, 250ms] + dimensions: + - name: http.method + default: GET + - name: http.status_code + dimensions_cache_size: 1000 + aggregation_temporality: "AGGREGATION_TEMPORALITY_CUMULATIVE" service: - extensions: [memory_ballast] pipelines: traces: receivers: [otlp] - processors: [memory_limiter, batch] - exporters: [jaeger] \ No newline at end of file + processors: [batch] + exporters: [otlp, spanmetrics] + metrics: + receivers: [otlp, spanmetrics] + processors: [metricstransform/insert] + exporters: [prometheus] 
\ No newline at end of file diff --git a/postgres-exporter-queries.yaml b/postgres-exporter-queries.yaml new file mode 100644 index 00000000..5c0148bc --- /dev/null +++ b/postgres-exporter-queries.yaml @@ -0,0 +1,311 @@ +pg_replication: + query: | + SELECT + CASE + WHEN NOT pg_is_in_recovery() THEN 0 + ELSE GREATEST(0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) + END AS lag + master: true + metrics: + - lag: + usage: "GAUGE" + description: "Replication lag behind master in seconds" + +pg_postmaster: + query: | + SELECT + pg_postmaster_start_time as start_time_seconds + from pg_postmaster_start_time() + master: true + metrics: + - start_time_seconds: + usage: "GAUGE" + description: "Time at which postmaster started" + +pg_stat_user_tables: + query: | + SELECT + current_database() datname, + schemaname, + relname, + seq_scan, + seq_tup_read, + idx_scan, + idx_tup_fetch, + n_tup_ins, + n_tup_upd, + n_tup_del, + n_tup_hot_upd, + n_live_tup, + n_dead_tup, + n_mod_since_analyze, + COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum, + COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum, + COALESCE(last_analyze, '1970-01-01Z') as last_analyze, + COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze, + vacuum_count, + autovacuum_count, + analyze_count, + autoanalyze_count + FROM + pg_stat_user_tables + metrics: + - datname: + usage: "LABEL" + description: "Name of current database" + - schemaname: + usage: "LABEL" + description: "Name of the schema that this table is in" + - relname: + usage: "LABEL" + description: "Name of this table" + - seq_scan: + usage: "COUNTER" + description: "Number of sequential scans initiated on this table" + - seq_tup_read: + usage: "COUNTER" + description: "Number of live rows fetched by sequential scans" + - idx_scan: + usage: "COUNTER" + description: "Number of index scans initiated on this table" + - idx_tup_fetch: + usage: "COUNTER" + description: "Number of live rows fetched by index scans" + - 
n_tup_ins: + usage: "COUNTER" + description: "Number of rows inserted" + - n_tup_upd: + usage: "COUNTER" + description: "Number of rows updated" + - n_tup_del: + usage: "COUNTER" + description: "Number of rows deleted" + - n_tup_hot_upd: + usage: "COUNTER" + description: "Number of rows HOT updated (i.e., with no separate index update required)" + - n_live_tup: + usage: "GAUGE" + description: "Estimated number of live rows" + - n_dead_tup: + usage: "GAUGE" + description: "Estimated number of dead rows" + - n_mod_since_analyze: + usage: "GAUGE" + description: "Estimated number of rows changed since last analyze" + - last_vacuum: + usage: "GAUGE" + description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)" + - last_autovacuum: + usage: "GAUGE" + description: "Last time at which this table was vacuumed by the autovacuum daemon" + - last_analyze: + usage: "GAUGE" + description: "Last time at which this table was manually analyzed" + - last_autoanalyze: + usage: "GAUGE" + description: "Last time at which this table was analyzed by the autovacuum daemon" + - vacuum_count: + usage: "COUNTER" + description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)" + - autovacuum_count: + usage: "COUNTER" + description: "Number of times this table has been vacuumed by the autovacuum daemon" + - analyze_count: + usage: "COUNTER" + description: "Number of times this table has been manually analyzed" + - autoanalyze_count: + usage: "COUNTER" + description: "Number of times this table has been analyzed by the autovacuum daemon" + +pg_statio_user_tables: + query: | + SELECT + current_database() datname, + schemaname, + relname, + heap_blks_read, + heap_blks_hit, + idx_blks_read, + idx_blks_hit, + toast_blks_read, + toast_blks_hit, + tidx_blks_read, + tidx_blks_hit + FROM pg_statio_user_tables; + metrics: + - datname: + usage: "LABEL" + description: "Name of current database" + - schemaname: + usage: "LABEL" + 
description: "Name of the schema that this table is in" + - relname: + usage: "LABEL" + description: "Name of this table" + - heap_blks_read: + usage: "COUNTER" + description: "Number of disk blocks read from this table" + - heap_blks_hit: + usage: "COUNTER" + description: "Number of buffer hits in this table" + - idx_blks_read: + usage: "COUNTER" + description: "Number of disk blocks read from all indexes on this table" + - idx_blks_hit: + usage: "COUNTER" + description: "Number of buffer hits in all indexes on this table" + - toast_blks_read: + usage: "COUNTER" + description: "Number of disk blocks read from this table's TOAST table (if any)" + - toast_blks_hit: + usage: "COUNTER" + description: "Number of buffer hits in this table's TOAST table (if any)" + - tidx_blks_read: + usage: "COUNTER" + description: "Number of disk blocks read from this table's TOAST table indexes (if any)" + - tidx_blks_hit: + usage: "COUNTER" + description: "Number of buffer hits in this table's TOAST table indexes (if any)" + +# WARNING: This set of metrics can be very expensive on a busy server as every unique query executed will create an additional time series +pg_stat_statements: + query: | + SELECT + t2.rolname, + t3.datname, + queryid, + calls, + total_exec_time / 1000 as total_time_seconds, + min_exec_time / 1000 as min_time_seconds, + max_exec_time / 1000 as max_time_seconds, + mean_exec_time / 1000 as mean_time_seconds, + stddev_exec_time / 1000 as stddev_time_seconds, + rows, + shared_blks_hit, + shared_blks_read, + shared_blks_dirtied, + shared_blks_written, + local_blks_hit, + local_blks_read, + local_blks_dirtied, + local_blks_written, + temp_blks_read, + temp_blks_written, + blk_read_time / 1000 as blk_read_time_seconds, + blk_write_time / 1000 as blk_write_time_seconds + FROM pg_stat_statements t1 + JOIN pg_roles t2 ON (t1.userid = t2.oid) + JOIN pg_database t3 ON (t1.dbid = t3.oid) + WHERE t2.rolname != 'rdsadmin' + master: true + metrics: + - rolname: + usage: "LABEL" 
+ description: "Name of user" + - datname: + usage: "LABEL" + description: "Name of database" + - queryid: + usage: "LABEL" + description: "Query ID" + - calls: + usage: "COUNTER" + description: "Number of times executed" + - total_time_seconds: + usage: "COUNTER" + description: "Total time spent in the statement, in seconds" + - min_time_seconds: + usage: "GAUGE" + description: "Minimum time spent in the statement, in seconds" + - max_time_seconds: + usage: "GAUGE" + description: "Maximum time spent in the statement, in seconds" + - mean_time_seconds: + usage: "GAUGE" + description: "Mean time spent in the statement, in seconds" + - stddev_time_seconds: + usage: "GAUGE" + description: "Population standard deviation of time spent in the statement, in seconds" + - rows: + usage: "COUNTER" + description: "Total number of rows retrieved or affected by the statement" + - shared_blks_hit: + usage: "COUNTER" + description: "Total number of shared block cache hits by the statement" + - shared_blks_read: + usage: "COUNTER" + description: "Total number of shared blocks read by the statement" + - shared_blks_dirtied: + usage: "COUNTER" + description: "Total number of shared blocks dirtied by the statement" + - shared_blks_written: + usage: "COUNTER" + description: "Total number of shared blocks written by the statement" + - local_blks_hit: + usage: "COUNTER" + description: "Total number of local block cache hits by the statement" + - local_blks_read: + usage: "COUNTER" + description: "Total number of local blocks read by the statement" + - local_blks_dirtied: + usage: "COUNTER" + description: "Total number of local blocks dirtied by the statement" + - local_blks_written: + usage: "COUNTER" + description: "Total number of local blocks written by the statement" + - temp_blks_read: + usage: "COUNTER" + description: "Total number of temp blocks read by the statement" + - temp_blks_written: + usage: "COUNTER" + description: "Total number of temp blocks 
written by the statement" + - blk_read_time_seconds: + usage: "COUNTER" + description: "Total time the statement spent reading blocks, in seconds (if track_io_timing is enabled, otherwise zero)" + - blk_write_time_seconds: + usage: "COUNTER" + description: "Total time the statement spent writing blocks, in seconds (if track_io_timing is enabled, otherwise zero)" + +pg_process_idle: + query: | + WITH + metrics AS ( + SELECT + application_name, + SUM(EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change))::bigint)::float AS process_idle_seconds_sum, + COUNT(*) AS process_idle_seconds_count + FROM pg_stat_activity + WHERE state = 'idle' + GROUP BY application_name + ), + buckets AS ( + SELECT + application_name, + le, + SUM( + CASE WHEN EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change)) <= le + THEN 1 + ELSE 0 + END + )::bigint AS bucket + FROM + pg_stat_activity, + UNNEST(ARRAY[1, 2, 5, 15, 30, 60, 90, 120, 300]) AS le + GROUP BY application_name, le + ORDER BY application_name, le + ) + SELECT + application_name, + process_idle_seconds_sum as seconds_sum, + process_idle_seconds_count as seconds_count, + ARRAY_AGG(le) AS seconds, + ARRAY_AGG(bucket) AS seconds_bucket + FROM metrics JOIN buckets USING (application_name) + GROUP BY 1, 2, 3 + metrics: + - application_name: + usage: "LABEL" + description: "Application Name" + - seconds: + usage: "HISTOGRAM" + description: "Idle time of server processes" \ No newline at end of file diff --git a/postgres-exporter.yaml b/postgres-exporter.yaml new file mode 100644 index 00000000..93452f25 --- /dev/null +++ b/postgres-exporter.yaml @@ -0,0 +1,8 @@ +auth_modules: + postgres: + type: userpass + userpass: + username: postgres + password: Password12! 
+ options: + sslmode: disable \ No newline at end of file diff --git a/postgres-init.sql b/postgres-init.sql new file mode 100644 index 00000000..4f2fe92b --- /dev/null +++ b/postgres-init.sql @@ -0,0 +1,2 @@ +create extension if not exists pg_stat_statements; +create extension if not exists dblink; \ No newline at end of file diff --git a/prometheus-config.yaml b/prometheus-config.yaml new file mode 100644 index 00000000..81e26fda --- /dev/null +++ b/prometheus-config.yaml @@ -0,0 +1,12 @@ +global: + evaluation_interval: 1m # How frequently to evaluate rules. + scrape_interval: 5s # How frequently to scrape targets by default. + scrape_timeout: 5s # How long until a scrape request times out. +scrape_configs: + - job_name: metrics + static_configs: + - targets: + - 'otel-collector:9464' # open-telemetry metrics exporter + - 'otel-collector:8888' # open-telemetry own metrics + - 'postgres-exporter:9187' # postgres metrics exporter + - 'jaeger:14269' # jaeger metrics exporter \ No newline at end of file diff --git a/rabbitmq-enabled-plugins b/rabbitmq-enabled-plugins new file mode 100644 index 00000000..318ea048 --- /dev/null +++ b/rabbitmq-enabled-plugins @@ -0,0 +1 @@ +[rabbitmq_management,rabbitmq_prometheus]. \ No newline at end of file