From 01c98a4e9812a3469c072d6c26f8bff8f258550d Mon Sep 17 00:00:00 2001 From: nikitagrishin Date: Wed, 29 Mar 2023 17:29:27 +0300 Subject: [PATCH] #200 --- docker-compose-webtest.yml | 147 ++++++++++++++-- jaeger-ui.json | 8 + otel-collector-config.yaml | 56 ++++-- postgres-exporter-queries.yaml | 311 +++++++++++++++++++++++++++++++++ postgres-exporter.yaml | 8 + postgres-init.sql | 2 + prometheus-config.yaml | 12 ++ rabbitmq-enabled-plugins | 1 + 8 files changed, 515 insertions(+), 30 deletions(-) create mode 100644 jaeger-ui.json create mode 100644 postgres-exporter-queries.yaml create mode 100644 postgres-exporter.yaml create mode 100644 postgres-init.sql create mode 100644 prometheus-config.yaml create mode 100644 rabbitmq-enabled-plugins diff --git a/docker-compose-webtest.yml b/docker-compose-webtest.yml index 9204aa61..28ce18bc 100644 --- a/docker-compose-webtest.yml +++ b/docker-compose-webtest.yml @@ -5,43 +5,156 @@ services: postgres: image: postgres:15.1-alpine container_name: postgres - restart: unless-stopped + command: postgres -c shared_preload_libraries=pg_stat_statements -c pg_stat_statements.track=all -c max_connections=200 + volumes: + - ./postgres-init.sql:/docker-entrypoint-initdb.d/postgres-init.sql environment: - POSTGRES_PASSWORD=Password12! 
ports: - '5432:5432' + healthcheck: + test: [ "CMD-SHELL", "pg_isready", "-d", "postgres", "-U", "postgres" ] + interval: 5s + timeout: 5s + retries: 5 + restart: unless-stopped + deploy: + resources: + limits: + memory: 512M + + # Postgres exporter + postgres-exporter: + image: quay.io/prometheuscommunity/postgres-exporter:v0.12.0 + container_name: postgres-exporter + command: + - --config.file=/etc/postgres-exporter/postgres-exporter.yaml + - --extend.query-path=/etc/postgres-exporter/postgres-exporter-queries.yaml + volumes: + - ./postgres-exporter.yaml:/etc/postgres-exporter/postgres-exporter.yaml + - ./postgres-exporter-queries.yaml:/etc/postgres-exporter/postgres-exporter-queries.yaml + environment: + - DATA_SOURCE_URI=postgres:5432?sslmode=disable + - DATA_SOURCE_USER=postgres + - DATA_SOURCE_PASS=Password12! + ports: + - '9187' + depends_on: + postgres: + condition: service_healthy + restart: unless-stopped + deploy: + resources: + limits: + memory: 512M # RabbitMQ rabbit: - image: rabbitmq:3.11.11-management-alpine + image: rabbitmq:3.11.11-alpine container_name: rabbit - restart: unless-stopped + volumes: + - ./rabbitmq-enabled-plugins:/etc/rabbitmq/enabled_plugins ports: - '15672:15672' - '5672:5672' - - # Jaeger - jaeger: - image: jaegertracing/all-in-one:1.43.0 - container_name: jaeger + healthcheck: + test: rabbitmq-diagnostics check_port_connectivity + interval: 5s + timeout: 5s + retries: 5 restart: unless-stopped + deploy: + resources: + limits: + memory: 512M + + # Prometheus + prometheus: + image: quay.io/prometheus/prometheus:v2.43.0 + container_name: prometheus + command: + - --web.console.templates=/etc/prometheus/consoles + - --web.console.libraries=/etc/prometheus/console_libraries + - --storage.tsdb.retention.time=1h + - --config.file=/etc/prometheus/prometheus-config.yaml + - --storage.tsdb.path=/prometheus + - --web.enable-lifecycle + - --web.route-prefix=/ + - --enable-feature=exemplar-storage + - --enable-feature=remote-write-receiver + 
volumes: + - ./prometheus-config.yaml:/etc/prometheus/prometheus-config.yaml ports: - - '5778:5778' # serve configs (sampling, etc.) - - '16686:16686' # serve frontend - - '14250:14250' # accept model.proto + - "9090:9090" + restart: unless-stopped + deploy: + resources: + limits: + memory: 512M # OpenTelemetry collector otel-collector: - image: otel/opentelemetry-collector:0.74.0 + image: otel/opentelemetry-collector-contrib:0.74.0 container_name: otel-collector - command: [ --config=/etc/otel-collector-config.yaml ] + command: + - --config=/etc/otel-collector-config.yaml + - --feature-gates=service.connectors volumes: - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml ports: - - '13133:13133' # health_check extension - '4317:4317' # OTLP gRPC receiver - '4318:4318' # OTLP http receiver + - "9464" # Prometheus exporter + - "8888" # metrics endpoint + restart: unless-stopped + deploy: + resources: + limits: + memory: 512M + + # Jaeger + jaeger: + image: jaegertracing/all-in-one:1.43.0 + container_name: jaeger + command: + - "--memory.max-traces" + - "1000" + - "--query.base-path" + - "/jaeger/ui" + - "--query.ui-config" + - "/etc/jaeger/jaeger-ui.json" + volumes: + - "./jaeger-ui.json:/etc/jaeger/jaeger-ui.json" + environment: + - COLLECTOR_OTLP_ENABLED=true + - METRICS_STORAGE_TYPE=prometheus + - PROMETHEUS_SERVER_URL=http://prometheus:9090 + - PROMETHEUS_TLS_Enabled=false + ports: + - '4317' # accept OpenTelemetry Protocol (OTLP) over gRPC + - '4318' # accept OpenTelemetry Protocol (OTLP) over HTTP + - '14268' # accept jaeger.thrift directly from clients over HTTP + - '14269' # admin HTTP server and metrics + - '16686:16686' # UI + restart: unless-stopped depends_on: - - postgres - - rabbit - - jaeger \ No newline at end of file + - prometheus + deploy: + resources: + limits: + memory: 512M + + ## Test web-app + #web-app: + # depends_on: + # - postgres + # - rabbit + # - jaeger + # build: + # dockerfile: Dockerfile.webtest + # context: . 
+ # restart: unless-stopped + # environment: + # - ASPNETCORE_ENVIRONMENT=Development + # ports: + # - '5000:80' \ No newline at end of file diff --git a/jaeger-ui.json b/jaeger-ui.json new file mode 100644 index 00000000..b2baccd0 --- /dev/null +++ b/jaeger-ui.json @@ -0,0 +1,8 @@ +{ + "monitor": { + "menuEnabled": true + }, + "dependencies": { + "menuEnabled": true + } +} \ No newline at end of file diff --git a/otel-collector-config.yaml b/otel-collector-config.yaml index 7a34ae83..cc07dce5 100644 --- a/otel-collector-config.yaml +++ b/otel-collector-config.yaml @@ -1,7 +1,3 @@ -extensions: - memory_ballast: - size_mib: 512 - receivers: otlp: protocols: @@ -10,21 +6,55 @@ receivers: processors: batch: - memory_limiter: - limit_mib: 1024 - spike_limit_mib: 512 - check_interval: 5s + metricstransform/insert: + transforms: + - include: calls + match_type: strict + action: insert + new_name: calls_total + operations: + - action: update_label + label: span.name + new_label: operation + - include: duration + match_type: strict + action: insert + new_name: latency + operations: + - action: update_label + label: span.name + new_label: operation exporters: - jaeger: - endpoint: "jaeger:14250" + otlp: + endpoint: "jaeger:4317" tls: insecure: true + prometheus: + endpoint: "otel-collector:9464" + resource_to_telemetry_conversion: + enabled: true + enable_open_metrics: true + +connectors: + spanmetrics: + histogram: + explicit: + buckets: [100us, 1ms, 2ms, 6ms, 10ms, 100ms, 250ms] + dimensions: + - name: http.method + default: GET + - name: http.status_code + dimensions_cache_size: 1000 + aggregation_temporality: "AGGREGATION_TEMPORALITY_CUMULATIVE" service: - extensions: [memory_ballast] pipelines: traces: receivers: [otlp] - processors: [memory_limiter, batch] - exporters: [jaeger] \ No newline at end of file + processors: [batch] + exporters: [otlp, spanmetrics] + metrics: + receivers: [otlp, spanmetrics] + processors: [metricstransform/insert] + exporters: [prometheus] 
\ No newline at end of file diff --git a/postgres-exporter-queries.yaml b/postgres-exporter-queries.yaml new file mode 100644 index 00000000..5c0148bc --- /dev/null +++ b/postgres-exporter-queries.yaml @@ -0,0 +1,311 @@ +pg_replication: + query: | + SELECT + CASE + WHEN NOT pg_is_in_recovery() THEN 0 + ELSE GREATEST(0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) + END AS lag + master: true + metrics: + - lag: + usage: "GAUGE" + description: "Replication lag behind master in seconds" + +pg_postmaster: + query: | + SELECT + pg_postmaster_start_time as start_time_seconds + from pg_postmaster_start_time() + master: true + metrics: + - start_time_seconds: + usage: "GAUGE" + description: "Time at which postmaster started" + +pg_stat_user_tables: + query: | + SELECT + current_database() datname, + schemaname, + relname, + seq_scan, + seq_tup_read, + idx_scan, + idx_tup_fetch, + n_tup_ins, + n_tup_upd, + n_tup_del, + n_tup_hot_upd, + n_live_tup, + n_dead_tup, + n_mod_since_analyze, + COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum, + COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum, + COALESCE(last_analyze, '1970-01-01Z') as last_analyze, + COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze, + vacuum_count, + autovacuum_count, + analyze_count, + autoanalyze_count + FROM + pg_stat_user_tables + metrics: + - datname: + usage: "LABEL" + description: "Name of current database" + - schemaname: + usage: "LABEL" + description: "Name of the schema that this table is in" + - relname: + usage: "LABEL" + description: "Name of this table" + - seq_scan: + usage: "COUNTER" + description: "Number of sequential scans initiated on this table" + - seq_tup_read: + usage: "COUNTER" + description: "Number of live rows fetched by sequential scans" + - idx_scan: + usage: "COUNTER" + description: "Number of index scans initiated on this table" + - idx_tup_fetch: + usage: "COUNTER" + description: "Number of live rows fetched by index scans" + - 
n_tup_ins: + usage: "COUNTER" + description: "Number of rows inserted" + - n_tup_upd: + usage: "COUNTER" + description: "Number of rows updated" + - n_tup_del: + usage: "COUNTER" + description: "Number of rows deleted" + - n_tup_hot_upd: + usage: "COUNTER" + description: "Number of rows HOT updated (i.e., with no separate index update required)" + - n_live_tup: + usage: "GAUGE" + description: "Estimated number of live rows" + - n_dead_tup: + usage: "GAUGE" + description: "Estimated number of dead rows" + - n_mod_since_analyze: + usage: "GAUGE" + description: "Estimated number of rows changed since last analyze" + - last_vacuum: + usage: "GAUGE" + description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)" + - last_autovacuum: + usage: "GAUGE" + description: "Last time at which this table was vacuumed by the autovacuum daemon" + - last_analyze: + usage: "GAUGE" + description: "Last time at which this table was manually analyzed" + - last_autoanalyze: + usage: "GAUGE" + description: "Last time at which this table was analyzed by the autovacuum daemon" + - vacuum_count: + usage: "COUNTER" + description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)" + - autovacuum_count: + usage: "COUNTER" + description: "Number of times this table has been vacuumed by the autovacuum daemon" + - analyze_count: + usage: "COUNTER" + description: "Number of times this table has been manually analyzed" + - autoanalyze_count: + usage: "COUNTER" + description: "Number of times this table has been analyzed by the autovacuum daemon" + +pg_statio_user_tables: + query: | + SELECT + current_database() datname, + schemaname, + relname, + heap_blks_read, + heap_blks_hit, + idx_blks_read, + idx_blks_hit, + toast_blks_read, + toast_blks_hit, + tidx_blks_read, + tidx_blks_hit + FROM pg_statio_user_tables; + metrics: + - datname: + usage: "LABEL" + description: "Name of current database" + - schemaname: + usage: "LABEL" + 
description: "Name of the schema that this table is in" + - relname: + usage: "LABEL" + description: "Name of this table" + - heap_blks_read: + usage: "COUNTER" + description: "Number of disk blocks read from this table" + - heap_blks_hit: + usage: "COUNTER" + description: "Number of buffer hits in this table" + - idx_blks_read: + usage: "COUNTER" + description: "Number of disk blocks read from all indexes on this table" + - idx_blks_hit: + usage: "COUNTER" + description: "Number of buffer hits in all indexes on this table" + - toast_blks_read: + usage: "COUNTER" + description: "Number of disk blocks read from this table's TOAST table (if any)" + - toast_blks_hit: + usage: "COUNTER" + description: "Number of buffer hits in this table's TOAST table (if any)" + - tidx_blks_read: + usage: "COUNTER" + description: "Number of disk blocks read from this table's TOAST table indexes (if any)" + - tidx_blks_hit: + usage: "COUNTER" + description: "Number of buffer hits in this table's TOAST table indexes (if any)" + +# WARNING: This set of metrics can be very expensive on a busy server as every unique query executed will create an additional time series +pg_stat_statements: + query: | + SELECT + t2.rolname, + t3.datname, + queryid, + calls, + total_exec_time / 1000 as total_time_seconds, + min_exec_time / 1000 as min_time_seconds, + max_exec_time / 1000 as max_time_seconds, + mean_exec_time / 1000 as mean_time_seconds, + stddev_exec_time / 1000 as stddev_time_seconds, + rows, + shared_blks_hit, + shared_blks_read, + shared_blks_dirtied, + shared_blks_written, + local_blks_hit, + local_blks_read, + local_blks_dirtied, + local_blks_written, + temp_blks_read, + temp_blks_written, + blk_read_time / 1000 as blk_read_time_seconds, + blk_write_time / 1000 as blk_write_time_seconds + FROM pg_stat_statements t1 + JOIN pg_roles t2 ON (t1.userid = t2.oid) + JOIN pg_database t3 ON (t1.dbid = t3.oid) + WHERE t2.rolname != 'rdsadmin' + master: true + metrics: + - rolname: + usage: "LABEL" 
+ description: "Name of user" + - datname: + usage: "LABEL" + description: "Name of database" + - queryid: + usage: "LABEL" + description: "Query ID" + - calls: + usage: "COUNTER" + description: "Number of times executed" + - total_time_seconds: + usage: "COUNTER" + description: "Total time spent in the statement, in seconds" + - min_time_seconds: + usage: "GAUGE" + description: "Minimum time spent in the statement, in seconds" + - max_time_seconds: + usage: "GAUGE" + description: "Maximum time spent in the statement, in seconds" + - mean_time_seconds: + usage: "GAUGE" + description: "Mean time spent in the statement, in seconds" + - stddev_time_seconds: + usage: "GAUGE" + description: "Population standard deviation of time spent in the statement, in seconds" + - rows: + usage: "COUNTER" + description: "Total number of rows retrieved or affected by the statement" + - shared_blks_hit: + usage: "COUNTER" + description: "Total number of shared block cache hits by the statement" + - shared_blks_read: + usage: "COUNTER" + description: "Total number of shared blocks read by the statement" + - shared_blks_dirtied: + usage: "COUNTER" + description: "Total number of shared blocks dirtied by the statement" + - shared_blks_written: + usage: "COUNTER" + description: "Total number of shared blocks written by the statement" + - local_blks_hit: + usage: "COUNTER" + description: "Total number of local block cache hits by the statement" + - local_blks_read: + usage: "COUNTER" + description: "Total number of local blocks read by the statement" + - local_blks_dirtied: + usage: "COUNTER" + description: "Total number of local blocks dirtied by the statement" + - local_blks_written: + usage: "COUNTER" + description: "Total number of local blocks written by the statement" + - temp_blks_read: + usage: "COUNTER" + description: "Total number of temp blocks read by the statement" + - temp_blks_written: + usage: "COUNTER" + description: "Total number of temp blocks 
written by the statement" + - blk_read_time_seconds: + usage: "COUNTER" + description: "Total time the statement spent reading blocks, in seconds (if track_io_timing is enabled, otherwise zero)" + - blk_write_time_seconds: + usage: "COUNTER" + description: "Total time the statement spent writing blocks, in seconds (if track_io_timing is enabled, otherwise zero)" + +pg_process_idle: + query: | + WITH + metrics AS ( + SELECT + application_name, + SUM(EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change))::bigint)::float AS process_idle_seconds_sum, + COUNT(*) AS process_idle_seconds_count + FROM pg_stat_activity + WHERE state = 'idle' + GROUP BY application_name + ), + buckets AS ( + SELECT + application_name, + le, + SUM( + CASE WHEN EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change)) <= le + THEN 1 + ELSE 0 + END + )::bigint AS bucket + FROM + pg_stat_activity, + UNNEST(ARRAY[1, 2, 5, 15, 30, 60, 90, 120, 300]) AS le + GROUP BY application_name, le + ORDER BY application_name, le + ) + SELECT + application_name, + process_idle_seconds_sum as seconds_sum, + process_idle_seconds_count as seconds_count, + ARRAY_AGG(le) AS seconds, + ARRAY_AGG(bucket) AS seconds_bucket + FROM metrics JOIN buckets USING (application_name) + GROUP BY 1, 2, 3 + metrics: + - application_name: + usage: "LABEL" + description: "Application Name" + - seconds: + usage: "HISTOGRAM" + description: "Idle time of server processes" \ No newline at end of file diff --git a/postgres-exporter.yaml b/postgres-exporter.yaml new file mode 100644 index 00000000..93452f25 --- /dev/null +++ b/postgres-exporter.yaml @@ -0,0 +1,8 @@ +auth_modules: + postgres: + type: userpass + userpass: + username: postgres + password: Password12! 
+ options: + sslmode: disable \ No newline at end of file diff --git a/postgres-init.sql b/postgres-init.sql new file mode 100644 index 00000000..4f2fe92b --- /dev/null +++ b/postgres-init.sql @@ -0,0 +1,2 @@ +create extension if not exists pg_stat_statements; +create extension if not exists dblink; \ No newline at end of file diff --git a/prometheus-config.yaml b/prometheus-config.yaml new file mode 100644 index 00000000..81e26fda --- /dev/null +++ b/prometheus-config.yaml @@ -0,0 +1,12 @@ +global: + evaluation_interval: 1m # How frequently to evaluate rules. + scrape_interval: 5s # How frequently to scrape targets by default. + scrape_timeout: 5s # How long until a scrape request times out. +scrape_configs: + - job_name: metrics + static_configs: + - targets: + - 'otel-collector:9464' # open-telemetry metrics exporter + - 'otel-collector:8888' # open-telemetry own metrics + - 'postgres-exporter:9187' # postgres metrics exporter + - 'jaeger:14269' # jaeger metrics exporter \ No newline at end of file diff --git a/rabbitmq-enabled-plugins b/rabbitmq-enabled-plugins new file mode 100644 index 00000000..318ea048 --- /dev/null +++ b/rabbitmq-enabled-plugins @@ -0,0 +1 @@ +[rabbitmq_management,rabbitmq_prometheus]. \ No newline at end of file