From fe25b8ce2ceb49236b577c9002ce1818f9a0243b Mon Sep 17 00:00:00 2001 From: Alexandre Touret Date: Tue, 2 Jul 2024 14:49:28 +0200 Subject: [PATCH 1/9] feat: typos --- docker/alloy/config.alloy | 3 ++ docs/workshop.md | 74 +++++++++++++++++++++++++-------------- 2 files changed, 50 insertions(+), 27 deletions(-) diff --git a/docker/alloy/config.alloy b/docker/alloy/config.alloy index 22a0098..2e04ea5 100644 --- a/docker/alloy/config.alloy +++ b/docker/alloy/config.alloy @@ -90,6 +90,8 @@ otelcol.receiver.otlp "default" { } } +//SAMPLING +/* otelcol.processor.tail_sampling "actuator" { policy { name = "filter_http_url" @@ -117,6 +119,7 @@ otelcol.processor.tail_sampling "actuator" { traces = [otelcol.processor.batch.default.input] } } +*/ otelcol.processor.batch "default" { output { diff --git a/docs/workshop.md b/docs/workshop.md index 762788c..66b520d 100644 --- a/docs/workshop.md +++ b/docs/workshop.md @@ -25,7 +25,6 @@ During this workshop we will use the Grafana stack and Prometheus: We will also cover the OpenTelemetry Collector which gathers & broadcasts then the data coming from our microservices ## Workshop overview -Duration: 0:02:00 ### Application High Level Design @@ -43,7 +42,7 @@ This is how it validates every payment: 3. Check the credit card type 4. Check the payment threshold, it calls the Smart Bank Gateway for authorization -If the payment is validated it stores it and broadcasts it to all the other microservices through Kafka. +If the payment is validated, it stores it and broadcasts it to all the other microservices through Kafka. 
#### Fraud detection Service @@ -67,7 +66,7 @@ As mentioned earlier, our observability stack is composed of : * [Loki](https://grafana.com/oss/loki/) for storing the logs * [Tempo](https://grafana.com/oss/tempo/) for storing the traces * [Grafana](https://grafana.com/) for the dashboards -* The [OTEL collector](https://opentelemetry.io/docs/collector/) which gathers all the data to send it then to +* [GRAFANA Alloy - OTEL collector](https://grafana.com/docs/alloy/latest/) which gathers all the data to send it then to In addition, the microservices are started with an agent to broadcast the traces to the collector. @@ -136,16 +135,13 @@ Docker Compose version v2.24.7 ``` #### If you don't want to bother with a local setup - -##### With Gitpod (recommended) -You can use [Gitpod](https://gitpod.io). +It's strongly recommended to use [Gitpod](https://gitpod.io). You must create an account first. You then can open this project in either your local VS Code or directly in your browser: [![Open in Gitpod](img/open-in-gitpod.svg)](https://gitpod.io/#github.com/worldline/observability-workshop.git) ## Environment Setup -Duration: 0:05:00 ### Open GitPod @@ -162,7 +158,7 @@ The "infrastructure stack" is composed of the following components: * One [Configuration server](https://docs.spring.io/spring-cloud-config/) is also used to centralise the configuration of our microservices. * The following microservices: API Gateway, Merchant BO, Fraud Detect, Smart Bank Gateway -If you run your application on GitPod, the following step is automatically started at the startup. +If you run your application on GitPod, the following step is automatically started during the provisioning of your GitPod environment. Otherwise, to run it on your desktop, execute the following commands @@ -184,12 +180,37 @@ $ docker compose ps -a ``` And check the status of every service. 
+For instance: + +```bash +❯ docker compose ps +NAME IMAGE COMMAND SERVICE CREATED STATUS PORTS +api-gateway api-gateway:latest "java -javaagent:/ap…" api-gateway 8 minutes ago Up 7 minutes (healthy) 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp +config-server config-server:latest "java -javaagent:/ap…" config-server 8 minutes ago Up 7 minutes (healthy) 0.0.0.0:8888->8888/tcp, :::8888->8888/tcp +discovery-server discovery-server:latest "java -javaagent:/ap…" discovery-server 8 minutes ago Up 7 minutes (healthy) 0.0.0.0:8761->8761/tcp, :::8761->8761/tcp +easypay-service easypay-service:latest "java -javaagent:/ap…" easypay-service 8 minutes ago Up 7 minutes (healthy) +fraudetect fraudetect-service:latest "java -javaagent:/ap…" fraudetect-service 8 minutes ago Up 7 minutes (healthy) +kafka confluentinc/cp-kafka:7.6.1 "/etc/confluent/dock…" kafka 8 minutes ago Up 8 minutes (healthy) 9092/tcp, 0.0.0.0:19092->19092/tcp, :::19092->19092/tcp +merchant-backoffice merchant-backoffice:latest "java -javaagent:/ap…" merchant-backoffice 8 minutes ago Up 7 minutes (healthy) +observability-workshop-collector-1 grafana/alloy:latest "/bin/alloy run --se…" collector 8 minutes ago Up 8 minutes 0.0.0.0:4317-4318->4317-4318/tcp, :::4317-4318->4317-4318/tcp, 0.0.0.0:12345->12345/tcp, :::12345->12345/tcp +observability-workshop-grafana-1 grafana/grafana:latest "sh -xeuc 'mkdir -p …" grafana 8 minutes ago Up 7 minutes 0.0.0.0:3000->3000/tcp, :::3000->3000/tcp +observability-workshop-loki-1 grafana/loki:latest "/usr/bin/loki -conf…" loki 8 minutes ago Up 7 minutes 0.0.0.0:3100->3100/tcp, :::3100->3100/tcp +observability-workshop-postgres-easypay-1 postgres:16 "docker-entrypoint.s…" postgres-easypay 8 minutes ago Up 8 minutes (healthy) 0.0.0.0:5432->5432/tcp, :::5432->5432/tcp +observability-workshop-postgres-fraudetect-1 postgres:16 "docker-entrypoint.s…" postgres-fraudetect 8 minutes ago Up 7 minutes (healthy) 0.0.0.0:5434->5432/tcp, :::5434->5432/tcp 
+observability-workshop-postgres-merchantbo-1 postgres:16 "docker-entrypoint.s…" postgres-merchantbo 8 minutes ago Up 8 minutes (healthy) 0.0.0.0:5435->5432/tcp, :::5435->5432/tcp +observability-workshop-postgres-smartbank-1 postgres:16 "docker-entrypoint.s…" postgres-smartbank 8 minutes ago Up 7 minutes (healthy) 0.0.0.0:5433->5432/tcp, :::5433->5432/tcp +observability-workshop-prometheus-1 prom/prometheus:v2.52.0 "/bin/prometheus --c…" prometheus 8 minutes ago Up 8 minutes 0.0.0.0:9090->9090/tcp, :::9090->9090/tcp +observability-workshop-tempo-1 grafana/tempo:latest "/tempo -config.file…" tempo 8 minutes ago Up 7 minutes 0.0.0.0:3200->3200/tcp, :::3200->3200/tcp, 0.0.0.0:9095->9095/tcp, :::9095->9095/tcp, 0.0.0.0:9411->9411/tcp, :::9411->9411/tcp, 0.0.0.0:14268->14268/tcp, :::14268->14268/tcp +smartbank-gateway smartbank-gateway:latest "java -Xmx4g -javaag…" smartbank-gateway 8 minutes ago Up 7 minutes (unhealthy) + +``` + #### Validation Open the [Eureka](https://cloud.spring.io/spring-cloud-netflix/) website started during the infrastructure setup. -If you run this workshop on your desktop, you can go to this URL: http://localhost:8761. -If you run it on GitPod, you can go to the corresponding URL (e.g., https://8761-worldline-observability-w98vrd59k5h.ws-eu114.gitpod.io) instead. +* If you run this workshop on your desktop, you can go to this URL: http://localhost:8761. +* If you run it on GitPod, you can go to the corresponding URL (e.g., https://8761-worldline-observability-w98vrd59k5h.ws-eu114.gitpod.io) instead. You can now reach our platform to initiate a payment: @@ -223,7 +244,6 @@ transfer-encoding: chunked ``` ## Logs -Duration: 0:30:00 ### Some functional issues One of our customers raised an issue: @@ -666,7 +686,6 @@ Finally, you can search logs based on the correlation ID ## Metrics -Duration: 0:30:00 Let’s take control of our application’s metrics! 
@@ -1157,7 +1176,6 @@ k6 -u 2 -d 2m k6/01-payment-only.js > Just hover the panel you are interested in, click on the three dots and select Edit. ## Traces -Duration: 0:20:00 Stop the easypay service. @@ -1208,21 +1226,20 @@ To avoid storing useless data into Tempo, we can sample the data in two ways: In this workshop, we will implement the latter. -In the alloy configuration file (``docker/alloy/config.alloy``), put this configuration just after the ``SAMPLING`` comment: +In the alloy configuration file (``docker/alloy/config.alloy``), uncomment this configuration just after the ``SAMPLING`` comment: ``` // SAMPLING -// otelcol.processor.tail_sampling "actuator" { -policy { -name = "filter_http_url" -type = "string_attribute" -string_attribute { -key = "http.url" -values = ["/actuator/health", "/actuator/prometheus"] -enabled_regex_matching = true -invert_match = true -} -} + policy { + name = "filter_http_url" + type = "string_attribute" + string_attribute { + key = "http.url" + values = ["/actuator/health", "/actuator/prometheus"] + enabled_regex_matching = true + invert_match = true + } + } policy { name = "filter_url_path" @@ -1234,6 +1251,11 @@ invert_match = true invert_match = true } } + + output { + traces = [otelcol.processor.batch.default.input] + } +} ``` This configuration will filter the [SPANs](https://opentelemetry.io/docs/concepts/signals/traces/#spans) created from ``/actuator`` API calls. @@ -1245,8 +1267,6 @@ $ docker compose restart collector ``` ## Correlate Traces, Logs -Duration: 0:15:00 - Let's go back to the Grafana explore dashboard. 
Select the ``Loki`` datasource From abc04c29e029a320030b8e80f40535c51aee6429 Mon Sep 17 00:00:00 2001 From: Alexandre Touret Date: Tue, 2 Jul 2024 15:15:56 +0200 Subject: [PATCH 2/9] feat: Typos --- docs/workshop.md | 10 +++++++--- .../easypay/payment/boundary/PaymentResource.java | 12 ++++++------ .../easypay/payment/control/PosValidator.java | 8 ++++---- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/docs/workshop.md b/docs/workshop.md index 66b520d..c29f73f 100644 --- a/docs/workshop.md +++ b/docs/workshop.md @@ -149,6 +149,8 @@ We will assume you will use GitPod for this workshop :) [![Open in Gitpod](img/open-in-gitpod.svg)](https://gitpod.io/#github.com/worldline/observability-workshop.git) +When a message invites you to make a URL public, select and validate it. + ### Start the infrastructure The "infrastructure stack" is composed of the following components: @@ -521,7 +523,11 @@ return httpResponse; ``` -Go to the MDC spring profile configuration file (``easypay-service/src/main/resources/application-mdc.properties``) and check the configuration got both the ``CardNumber`` & ``POS``fields. +Go to the MDC spring profile configuration file (``easypay-service/src/main/resources/application-mdc.properties``) and check the configuration of both the ``CardNumber`` & ``POS`` fields. + +```properties +[...] %clr(CardNumber=){faint}%clr(%X{CardNumber:-null}) %clr(POS=){faint}%clr(%X{POS:-null}) [...] +``` Activate the ``mdc`` profile in the ``compose.yml`` file: @@ -568,7 +574,6 @@ $ docker compose down $ docker compose up -d --build --remove-orphans ``` - > aside positive > > During this workshop, we will only obfuscate the card numbers in Loki. It will therefore be stored as is in the log files but obfuscated in Loki and by this way in the data exposed on Grafana. @@ -666,7 +671,6 @@ Select the Loki datasource. In the label filter, select the application as ``easypay-service`` and click on ``Run Query``. 
- Add then a JSON parser operation , click on ``Run query`` again and check out the logs. Additionally, you can add these expressions in the JSON parser operation box: diff --git a/easypay-service/src/main/java/com/worldline/easypay/payment/boundary/PaymentResource.java b/easypay-service/src/main/java/com/worldline/easypay/payment/boundary/PaymentResource.java index f5a810c..f190010 100644 --- a/easypay-service/src/main/java/com/worldline/easypay/payment/boundary/PaymentResource.java +++ b/easypay-service/src/main/java/com/worldline/easypay/payment/boundary/PaymentResource.java @@ -42,15 +42,15 @@ public PaymentResource(PaymentService paymentService) { @GetMapping @Operation(description = "List all payments that have been processed", summary = "List all payments") public ResponseEntity> findAll() { - LOG.info("Request: get all processed payments"); +// LOG.info("Request: get all processed payments"); return ResponseEntity.ok(paymentService.findAll()); } @GetMapping("count") @Operation(description = "Count all payments", summary = "Count payments") public ResponseEntity count() { - LOG.info("Request: get number of processed payments"); - return ResponseEntity.ok(paymentService.count()); +// LOG.info("Request: get number of processed payments"); +// return ResponseEntity.ok(paymentService.count()); } @GetMapping("{id}") @@ -59,14 +59,14 @@ public ResponseEntity count() { @ApiResponse(responseCode = "204", description = "Payment not found", content = @Content(mediaType = "text/plain")) public ResponseEntity findById( @Parameter(description = "The payment id to be retrieved", required = true) @PathVariable("id") String paymentId) { - LOG.info("Request: get payment by id: {}", paymentId); +// LOG.info("Request: get payment by id: {}", paymentId); UUID id = UUID.fromString(paymentId); var payment = paymentService.findById(id); if (payment.isEmpty()) { - LOG.warn("Payment with id {} not found.", paymentId); +// LOG.warn("Payment with id {} not found.", paymentId); return 
ResponseEntity.notFound().build(); } - LOG.debug("Response: found payment: {}", payment.get()); +// LOG.debug("Response: found payment: {}", payment.get()); return ResponseEntity.ok(payment.get()); } diff --git a/easypay-service/src/main/java/com/worldline/easypay/payment/control/PosValidator.java b/easypay-service/src/main/java/com/worldline/easypay/payment/control/PosValidator.java index a64ff87..5d2d933 100644 --- a/easypay-service/src/main/java/com/worldline/easypay/payment/control/PosValidator.java +++ b/easypay-service/src/main/java/com/worldline/easypay/payment/control/PosValidator.java @@ -27,15 +27,15 @@ public boolean isActive(String posId) { List posList = posRefRepository.findAll(Example.of(probe)); if (posList.isEmpty()) { - LOG.warn( "Check POS does not pass: unknown posId {}", posId); +// LOG.warn( "Check POS does not pass: unknown posId {}", posId); return false; } boolean result = posList.get(0).active; - if (!result) { - LOG.warn( "Check POS does not pass: inactive posId {}", posId); - } +// if (!result) { +// LOG.warn( "Check POS does not pass: inactive posId {}", posId); +// } return result; From 4b366f827c827320c64f3a62d5d291eabeb19a31 Mon Sep 17 00:00:00 2001 From: Alexandre Touret Date: Tue, 2 Jul 2024 15:18:28 +0200 Subject: [PATCH 3/9] feat: Typos --- docs/workshop.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/workshop.md b/docs/workshop.md index c29f73f..d033ff2 100644 --- a/docs/workshop.md +++ b/docs/workshop.md @@ -814,9 +814,9 @@ Now explore again the targets (``Status`` > ``Targets``) on the Prometheus dashb * Select the ``Prometheus`` datasource instead of the ``Loki`` one. -In this section you will hands on the metrics query builder of Grafana. +In this section you will get hands-on with the metrics query builder of Grafana. -The ``Metric`` field lists all the metrics available in Prometheus server: take time to explore them. 
+The ``Metric`` field lists all the metrics available in the Prometheus server: take time to explore them. * For example, you can select the metric named ``jvm_memory_used_bytes``, and click on the ``Run query`` button to plot the memory usage of all your services by memory area, @@ -854,13 +854,13 @@ To import these dashboards: > Imported dashboards are available directly from the ``Dashboards`` section of Grafana. Explore the ``JVM Micrometer`` dashboard: it works almost out of box. -It contains lot of useful information about JVMs running our services. +It contains a lot of useful information about JVMs running our services. The ``application`` filter (top of the dashboard) let you select the service you want to explore metrics. ### Incident! -Now let's simulate some traffic using Grafana K6. +Now let's simulate some traffic using [Grafana K6](https://k6.io/). Run the following command: @@ -962,7 +962,7 @@ public class PaymentService { ``` 1. Declare the two timers, 2. Injects the ``MeterRegistry`` provided by Spring Boot Actuator in the class constructor, as it is required to initialize the timers, -3. Intitialize the two timers by giving them a name (4), a description (5) and adding them to the meter registry. +3. Initialize the two timers by giving them a name (4), a description (5) and adding them to the meter registry. #### 2. Record time spent in the methods @@ -1285,7 +1285,7 @@ They will help us correlate our different requests logs and traces. > These notions are part of the [W3C Trace Context Specification](https://www.w3.org/TR/trace-context/). Now, go below in the Fields section. -You should see a ``Links`` sub-section with a ``View Trace`` button. +You should see a ``Links`` subsection with a ``View Trace`` button. Click on it. You will see the corresponding trace of this log. 
From b48bd2e339a58118945a4118ac8d7cfce21c0c80 Mon Sep 17 00:00:00 2001 From: David Pequegnot Date: Mon, 1 Jul 2024 14:26:40 +0000 Subject: [PATCH 4/9] feat: remove java instrumentation from easypay-service dockerfile --- easypay-service/src/main/docker/Dockerfile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/easypay-service/src/main/docker/Dockerfile b/easypay-service/src/main/docker/Dockerfile index 7719a34..9137bdd 100644 --- a/easypay-service/src/main/docker/Dockerfile +++ b/easypay-service/src/main/docker/Dockerfile @@ -34,6 +34,4 @@ RUN apk add --no-cache curl USER javauser -COPY instrumentation/grafana-opentelemetry-java.jar /app/grafana-opentelemetry-java.jar - -ENTRYPOINT ["java", "-javaagent:/app/grafana-opentelemetry-java.jar", "-cp","app:app/lib/*","com.worldline.easypay.EasypayServiceApplication"] \ No newline at end of file +ENTRYPOINT ["java", "-cp","app:app/lib/*","com.worldline.easypay.EasypayServiceApplication"] \ No newline at end of file From 1ce8637988b62cea94e774298141e67630320e7f Mon Sep 17 00:00:00 2001 From: David Pequegnot Date: Mon, 1 Jul 2024 18:44:49 +0000 Subject: [PATCH 5/9] feat: add opentelemetry dependency for annotations --- easypay-service/build.gradle.kts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/easypay-service/build.gradle.kts b/easypay-service/build.gradle.kts index 0e0282b..c9d53a1 100644 --- a/easypay-service/build.gradle.kts +++ b/easypay-service/build.gradle.kts @@ -56,9 +56,10 @@ dependencies { implementation("ch.qos.logback.contrib:logback-json-classic:0.1.5") implementation("ch.qos.logback.contrib:logback-jackson:0.1.5") - // Add opentelemetry exemplars support + // Add opentelemetry support implementation(platform("io.opentelemetry:opentelemetry-bom:1.38.0")) implementation("io.opentelemetry:opentelemetry-api") + implementation("io.opentelemetry.instrumentation:opentelemetry-instrumentation-annotations:2.5.0") 
implementation("io.prometheus:prometheus-metrics-tracer-otel-agent:1.3.1") developmentOnly("org.springframework.boot:spring-boot-devtools") From 1df12744cdac7a39491fae2d71f87ffa51247bf5 Mon Sep 17 00:00:00 2001 From: David Pequegnot Date: Mon, 1 Jul 2024 19:17:55 +0000 Subject: [PATCH 6/9] feat: traces documentation with custom traces --- docs/workshop.md | 360 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 330 insertions(+), 30 deletions(-) diff --git a/docs/workshop.md b/docs/workshop.md index d033ff2..4af08ab 100644 --- a/docs/workshop.md +++ b/docs/workshop.md @@ -142,6 +142,7 @@ You then can open this project in either your local VS Code or directly in your [![Open in Gitpod](img/open-in-gitpod.svg)](https://gitpod.io/#github.com/worldline/observability-workshop.git) ## Environment Setup +Duration: 0:05:00 ### Open GitPod @@ -246,6 +247,7 @@ transfer-encoding: chunked ``` ## Logs +Duration: 0:30:00 ### Some functional issues One of our customers raised an issue: @@ -1180,59 +1182,234 @@ k6 -u 2 -d 2m k6/01-payment-only.js > Just hover the panel you are interested in, click on the three dots and select Edit. ## Traces +In this section, we'll explore **distributed tracing**, the third pillar of application observability. -Stop the easypay service. +Distributed tracing is an essential tool for monitoring and analyzing the performance of complex applications. It tracks the flow of requests across multiple services and components, helping to identify bottlenecks and improve efficiency — particularly useful for intricate systems like Easypay. -Open the ``easypay.sh`` script file. You will then how is configured the JVM startup with the ``-javaagent`` parameter. 
+With Spring Boot, there are a couple of approaches to incorporate distributed tracing into your application: +* Utilize the [Spring Boot Actuator integration](https://docs.spring.io/spring-boot/docs/current/reference/html/actuator.html#actuator.tracing) with support from [Micrometer Tracing](https://docs.micrometer.io/docs/tracing), +* Or adopt a broader [Java Agent approach](https://github.com/open-telemetry/opentelemetry-java-instrumentation) provided by the OpenTelemetry project, which automatically instruments our code when attached to our JVM. -```shell -#!/usr/bin/env bash +For this workshop, we'll use the Java Agent method and, with a focus on Grafana, we will employ their version of the [OpenTelemetry Java Agent](https://github.com/grafana/grafana-opentelemetry-java). -export OTEL_SERVICE_NAME=easypay-service -export OTEL_EXPORTER_OTLP_ENDPOINT="http://localhost:4317" -export OTEL_EXPORTER_OTLP_PROTOCOL=grpc -export OTEL_RESOURCE_ATTRIBUTES="source=agent" +The Grafana Alloy collector will be used once again, tasked with receiving traces and forwarding them to the Tempo backend. -export SERVER_PORT=8081 -export LOGS_DIRECTORY="$(pwd)/logs" +> aside positive +> +> Utilizing collectors offers several advantages for managing telemetry data: +> - Reduces the need for complicated application configurations: just send data to `localhost`, +> - Centralizes configuration to a single point: the collector, +> - Acts as a buffer to prevent resource overuse, +> - Can transform data before ingestion, +> - Supports data intake from various protocols and can relay them to any backend, +> - ... + +Lastly, we will use Grafana to examine and interpret these traces, allowing us to better understand and optimize our application's performance. + +### Enable distributed tracing + +To capture the entire transaction across all services in a trace, it's essential to instrument all the services in our application. 
+ +> aside positive +> +> In this workshop, our primary focus will be on the `easypay` service. +> For efficiency, we have already instrumented the other services beforehand. + +#### Download Grafana OpenTelemetry Java Agent + +If you're using *GitPod*, the Java Agent should already be available at `instrumentation/grafana-opentelemetry-java.jar`. + +🛠️ If you are participating in this workshop on your workstation, or if the file is missing, you can run the following script to download it: + +```bash +bash -x scripts/download-agent.sh +``` + +#### Enable Java Agent + +📝 Since we are deploying the easypay-service using *Docker*, we need to modify the last lines of the `easypay-service/src/main/docker/Dockerfile`: + +```Dockerfile +# ... +USER javauser + +# Copy Java Agent into the container +COPY instrumentation/grafana-opentelemetry-java.jar /app/grafana-opentelemetry-java.jar + +# Add the -javaagent flag to setup the JVM to start with our Java Agent +ENTRYPOINT ["java", "-javaagent:/app/grafana-opentelemetry-java.jar", "-cp","app:app/lib/*","com.worldline.easypay.EasypayServiceApplication"] +``` + +The ENTRYPOINT instruction specifies the default command that will be executed when the container starts. + +🛠️ You can now build the updated easypay-service container image: + +```bash +docker compose build easypay-service +``` + +#### Configure Grafana Alloy + +It's time to set up *Grafana Alloy* for handling telemetry data. We will configure it to accept traces through the OpenTelemetry GRPC protocol (OTLP) on port `4317`, and then forward them to *Grafana Tempo*, which listens on the host `tempo` on the same port `4317` (this setup specifically handles OTLP traces). + +📝 Please add the following configuration to the `docker/alloy/config.alloy` file: + +```terraform +// ... 
+ +// RECEIVER SETUP (OTLP GRPC) (1) +otelcol.receiver.otlp "default" { + grpc { + endpoint = "0.0.0.0:4317" + } + + output { + traces = [otelcol.processor.batch.default.input] + } +} + +// BATCH PROCESSING FOR OPTIMIZATION (2) +otelcol.processor.batch "default" { + output { + traces = [otelcol.exporter.otlp.tempo.input] + } +} + +// TRACE EXPORTING TO TEMPO (OTLP) (3) +otelcol.exporter.otlp "tempo" { + client { + endpoint = "tempo:4317" + + tls { + insecure = true + } + } +} +``` +1. Setting up the [``otelcol.receiver.otlp``](https://grafana.com/docs/alloy/latest/reference/components/otelcol.receiver.otlp/) receiver to accept telemetry data over the OTEL protocol via GRPC, listening on port `4317`, +2. Configuring the [processor](https://grafana.com/docs/alloy/latest/reference/components/otelcol.processor.batch/) to batch traces efficiently, reducing resource usage, +3. Establishing the [``otelcol.exporter.otlp``](https://grafana.com/docs/alloy/latest/reference/components/otelcol.exporter.otlp/) exporter to send collected telemetry data to the Grafana Tempo service. + +ℹ️ The Grafana OpenTelemetry Java Agent is pre-configured to transmit telemetry data directly to the collector. This setup is facilitated through environment variables specified in the `compose.yml` file: + +```yaml +services: + # ... + easypay-service: + # .. + environment: + # ... + OTEL_SERVICE_NAME: easypay-service (1) + OTEL_EXPORTER_OTLP_ENDPOINT: http://collector:4317 (2) + OTEL_EXPORTER_OTLP_PROTOCOL: grpc (3) + # ... +``` +1. `OTEL_SERVICE_NAME` defines a service name which will be attached to traces to identify the instrumented service, +2. `OTEL_EXPORTER_OTLP_ENDPOINT` environment variable configures where the telemetry data should be sent, +3. `OTEL_EXPORTER_OTLP_PROTOCOL` sets the OTLP protocol used behind, here GRPC (can be HTTP). 
+ +> aside positive +> +> Find more information about how to configure the OpenTelemetry Java Agent in [its official documentation](https://opentelemetry.io/docs/languages/java/configuration/). + +🛠️ To apply the new settings, restart Grafana Alloy with the following command: + +```bash +docker compose restart collector +``` + +✅ After restarting, verify that Grafana Alloy is up and running with the updated configuration by accessing the Alloy dashboard on port ``12345``. -java -Xms512m -Xmx512m -javaagent:$(pwd)/instrumentation/grafana-opentelemetry-java.jar -jar "$(pwd)/easypay-service/build/libs/easypay-service-0.0.1-SNAPSHOT.jar" "$@" +🛠️ Redeploy the updated ``easypay-service``: + +```bash +docker compose up -d easypay-service +``` + +✅ To ensure easypay-service has started up correctly, check its logs with: + +```bash +docker compose logs -f easypay-service ``` -During this workshop, we will use an OpenTelemetry agent for broadcasting traces through Alloy to Tempo. +#### Explore Traces with Grafana + +> aside positive +> +> For this workshop, we've already configured the Tempo datasource in Grafana. +> You can take a look at its configuration in Grafana (available on port ``3000``) by navigating to the `Connections` > `Data sources` section. +> Similar to Prometheus, the configuration is quite straightforward as we only need to set up the Tempo server URL. + +🛠️ Generate some load on the application to produce traces: -Check the environment variables used: +```bash +k6 run -u 1 -d 5m k6/01-payment-only.js +``` -* ``OTEL_SERVICE_NAME`` -* ``OTEL_EXPORT_OTLP_ENDPOINT`` -* ``OTEL_EXPORT_OTLP_PROTOCOL`` -* ``OTEL_EXPORT_ATTRIBUTES`` +🛠️ Let’s explore your first traces in Grafana: +* Go to Grafana and open an ``Explore`` dashboard, +* Select the `Tempo` data source and click on ``Run query`` to refresh the view. + +> aside negative +> +> You may need to wait one or two minutes to allow Tempo to ingest some traces… -Now open a new explore Grafana dashboard. 
+👀 Click on `Service Graph` and explore the `Node graph`: this view is extremely helpful for visualizing and understanding how our services communicate with each other. -Select the Tempo datasource. +👀 Go back to `Search` and click on `Run query`. You should see a table named `Table - Traces`. +By default, this view provides the most recent traces available in *Tempo*. -Look around the node graph, pinpoint what are the different nodes and corresponding response times. +🛠️ Let's find an interesting trace using the query builder: +* Look at all traces corresponding to a POST to `easypay-service` with a duration greater than 50 ms: + * Span Name: `POST easypay-service` + * Duration: `trace` `>` `50ms` + * You can review the generated query, which uses a syntax called TraceQL. +* Click on `Run query`. +* Sort the table by `Duration` (click on the column name) to find the slowest trace. +* Drill down a `Trace ID`. -Create a query, select service name as ``easypay-service``. -Click on ``Run query`` and Drill down a Trace ID to get the full stack of the corresponding transaction. +You should see the full stack of the corresponding transaction. -Explore the corresponding SQL queries and their response times. +👀 Grafana should open a new view (you can enlarge it by clicking on the three vertical dots and selecting `Widen pane`): +* Pinpoint the different nodes and their corresponding response times: + * Each line is a span and corresponds to the time spent in a method/event. +* Examine the SQL queries and their response times. +* Discover that distributed tracing can link transactions through: + * HTTP (`api-gateway` to `easypay-service` and `easypay-service` to `smartbank-gateway`). + * Kafka (`easypay-service` to `fraudetect-service` and `merchant-backoffice`). +* Click on `Node graph` to get a graphical view of all the spans participating in the trace. -Finally, check the traces from different services (e.g., ``api-gateway``). 
+🛠️ Continue your exploration in the `Search` pane: +* For example, you can add the `Status` `=` `error` filter to see only traces that contain errors. ### Sampling -To avoid storing useless data into Tempo, we can sample the data in two ways: +When we instrument our services using the agent, every interaction, including Prometheus calls to the `actuator/prometheus` endpoint, is recorded. + +To avoid storing unnecessary data in Tempo, we can sample the data in two ways: * [Head Sampling](https://opentelemetry.io/docs/concepts/sampling/#head-sampling) -* [Tail Sampling](https://opentelemetry.io/docs/concepts/sampling/#head-sampling) +* [Tail Sampling](https://opentelemetry.io/docs/concepts/sampling/#tail-sampling) -In this workshop, we will implement the latter. +In this workshop, we will implement Tail Sampling. In the alloy configuration file (``docker/alloy/config.alloy``), uncomment this configuration just after the ``SAMPLING`` comment: +Modify the Alloy configuration file (``docker/alloy/config.alloy``) as follows: ``` // SAMPLING +// ... +// RECEIVER (OTLP) +otelcol.receiver.otlp "default" { + grpc { + endpoint = "0.0.0.0:4317" + } + + output { + traces = [otelcol.processor.tail_sampling.actuator.input] // (1) + } +} + +// TAIL SAMPLING (2) otelcol.processor.tail_sampling "actuator" { policy { name = "filter_http_url" @@ -1244,7 +1421,19 @@ otelcol.processor.tail_sampling "actuator" { invert_match = true } } + // Filter on http.url attribute (3) + policy { + name = "filter_http_url" + type = "string_attribute" + string_attribute { + key = "http.url" + values = ["/actuator/health", "/actuator/prometheus"] + enabled_regex_matching = true + invert_match = true + } + } + // Filter on url.path attribute (3) policy { name = "filter_url_path" type = "string_attribute" @@ -1257,19 +1446,130 @@ otelcol.processor.tail_sampling "actuator" { } output { - traces = [otelcol.processor.batch.default.input] + traces = [otelcol.processor.batch.default.input] // (4) } } ``` +1. 
Modify the output of the `otelcol.receiver.otlp` to export traces to the [otelcol.processor.tail_sampling](https://grafana.com/docs/alloy/latest/reference/components/otelcol.processor.tail_sampling/) component defined just after. +2. Create a new `otelcol.processor.tail_sampling` component. +3. Configure it with two policies based on span attributes. +4. Export non-filtered spans to the `otelcol.processor.batch` processor we defined previously. -This configuration will filter the [SPANs](https://opentelemetry.io/docs/concepts/signals/traces/#spans) created from ``/actuator`` API calls. +This configuration will filter the [SPANs](https://opentelemetry.io/docs/concepts/signals/traces/#spans) created from `/actuator` API calls. -Restart then Alloy. +🛠️ Restart the Alloy collector: ```bash $ docker compose restart collector ``` +Starting from this moment, you should no longer see traces related to `actuator/health` or `actuator/prometheus` endpoints. + +### Custom Traces + +Just like metrics, it is also possible to add your own spans on arbitrary methods to provide more business value to the observability of your application. + +Let’s return to our code! + +#### Objectives + +We want to add new spans to the traces generated in the `easypay-service` application to track payment processing and store events. + +To achieve this goal, we will create new spans when the `process` and `store` methods of the `com.worldline.easypay.payment.control.PaymentService` class in the `easypay-service` module are invoked. + +As a reminder, this class is the central component responsible for processing payments. It provides the public method `accept`, which delegates its responsibilities to two private methods: +* `process`: which handles all the processing of the payment, including validation and calling third parties. +* `store`: which saves the processing result in the database. + +#### 1. 
Add Required Dependencies + +We need to add the `io.opentelemetry.instrumentation:opentelemetry-instrumentation-annotations` dependency to our module to access some useful annotations. + +👀 This has already been done in advance for this workshop. The following dependencies were added to the Gradle build file (`build.gradle.kts`) of the `easypay-service` module: + +```kotlin +dependencies { + //... + + // Add opentelemetry support + implementation(platform("io.opentelemetry:opentelemetry-bom:1.38.0")) + implementation("io.opentelemetry:opentelemetry-api") + implementation("io.opentelemetry.instrumentation:opentelemetry-instrumentation-annotations:2.5.0") + + // ... +} +``` + +#### 2. Add Custom Spans + +📝 To add new spans based on methods, we can simply use the `@WithSpan` Java annotation. When a traced transaction invokes the annotated method, a new span will be created. Here’s how to do it: + +```java +// ... +import io.opentelemetry.instrumentation.annotations.WithSpan; + +@Service +public class PaymentService { + // ... + + @WithSpan("Payment processing method") + private void process(PaymentProcessingContext context) { + //... + } + + @WithSpan("Payment store method") + private void store(PaymentProcessingContext context) { + //... + } +``` + +📝 We can also provide additional information to the span, such as method parameters using the ``@SpanAttribute`` annotation: + +```java +// ... +import io.opentelemetry.instrumentation.annotations.SpanAttribute; + +@Service +public class PaymentService { + // ... + + @WithSpan("RivieraDev: Payment processing method") + private void process(@SpanAttribute("context") PaymentProcessingContext context) { // <-- HERE + // ... + } + + @WithSpan("RivieraDev: Payment store method") + private void store(@SpanAttribute("context") PaymentProcessingContext context) { // <-- HERE + // ... + } +``` + +This will add the whole `PaymentProcessingContext` to the span as an attribute named `context`. + +#### 3. 
Build and redeploy + +🛠️ As we did before: + +```bash +docker compose build easypay-service +docker compose up -d easypay-service +``` + +#### 4. Test it! + +🛠️ Generate some payments: + +```bash +http POST :8080/api/easypay/payments posId=POS-01 cardNumber=5555567898780008 expiryDate=789456123 amount:=40000 +``` + +👀 Go back to Grafana and try to find your new traces using what you've learned previously. Observe the spans you added. + +> aside negative +> +> It may take some time for `easypay-service` to be registered in the service discovery and be available from the API gateway. +> Similarly, your traces being ingested by Tempo might also take some time. Patience is key 😅 + ## Correlate Traces, Logs Let's go back to the Grafana explore dashboard. From 6d5ccbe38e987d9af5b4a124217b9320c6e690d9 Mon Sep 17 00:00:00 2001 From: Alexandre Touret Date: Tue, 2 Jul 2024 14:49:28 +0200 Subject: [PATCH 7/9] feat: typos --- docs/workshop.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/workshop.md b/docs/workshop.md index 4af08ab..ef8fea3 100644 --- a/docs/workshop.md +++ b/docs/workshop.md @@ -142,7 +142,6 @@ You then can open this project in either your local VS Code or directly in your [![Open in Gitpod](img/open-in-gitpod.svg)](https://gitpod.io/#github.com/worldline/observability-workshop.git) ## Environment Setup -Duration: 0:05:00 ### Open GitPod @@ -247,7 +246,6 @@ transfer-encoding: chunked ``` ## Logs -Duration: 0:30:00 ### Some functional issues One of our customers raised an issue: From 7059747dc3bf68f7f8588e255985d1eb6d11bc8d Mon Sep 17 00:00:00 2001 From: Alexandre Touret Date: Tue, 2 Jul 2024 15:40:26 +0200 Subject: [PATCH 8/9] feat: fix build issue --- .../com/worldline/easypay/payment/boundary/PaymentResource.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easypay-service/src/main/java/com/worldline/easypay/payment/boundary/PaymentResource.java 
b/easypay-service/src/main/java/com/worldline/easypay/payment/boundary/PaymentResource.java index f190010..4d84e1f 100644 --- a/easypay-service/src/main/java/com/worldline/easypay/payment/boundary/PaymentResource.java +++ b/easypay-service/src/main/java/com/worldline/easypay/payment/boundary/PaymentResource.java @@ -50,7 +50,7 @@ public ResponseEntity> findAll() { @Operation(description = "Count all payments", summary = "Count payments") public ResponseEntity count() { // LOG.info("Request: get number of processed payments"); -// return ResponseEntity.ok(paymentService.count()); + return ResponseEntity.ok(paymentService.count()); } @GetMapping("{id}") From 297e270248063a4adbbd4f72fe6ba419875a393e Mon Sep 17 00:00:00 2001 From: Alexandre Touret Date: Tue, 2 Jul 2024 16:00:49 +0200 Subject: [PATCH 9/9] feat: still merging --- docs/workshop.md | 200 +---------------------------------------------- 1 file changed, 1 insertion(+), 199 deletions(-) diff --git a/docs/workshop.md b/docs/workshop.md index 80ed759..34610e7 100644 --- a/docs/workshop.md +++ b/docs/workshop.md @@ -575,6 +575,7 @@ $ docker compose down $ docker compose up -d --build --remove-orphans ``` + > aside positive > > During this workshop, we will only obfuscate the card numbers in Loki. It will therefore be stored as is in the log files but obfuscated in Loki and by this way in the data exposed on Grafana. @@ -1205,33 +1206,6 @@ The Grafana Alloy collector will be used once again, tasked with receiving trace Lastly, we will use Grafana to examine and interpret these traces, allowing us to better understand and optimize our application's performance. -### Enable distributed tracing -Duration: 20 minutes - -In this section, we'll explore **distributed tracing**, the third pillar of application observability. - -Distributed tracing is an essential tool for monitoring and analyzing the performance of complex applications. 
It tracks the flow of requests across multiple services and components, helping to identify bottlenecks and improve efficiency — particularly useful for intricate systems like Easypay. - -With Spring Boot, there are a couple of approaches to incorporate distributed tracing into your application: -* Utilize the [Spring Boot Actuator integration](https://docs.spring.io/spring-boot/docs/current/reference/html/actuator.html#actuator.tracing) with support from [Micrometer Tracing](https://docs.micrometer.io/docs/tracing), -* Or adopt a broader [Java Agent approach](https://github.com/open-telemetry/opentelemetry-java-instrumentation) provided by the OpenTelemetry project, which automatically instruments our code when attached to our JVM. - -For this workshop, we'll use the Java Agent method and, with a focus on Grafana, we will employ their version of the [OpenTelemetry Java Agent](https://github.com/grafana/grafana-opentelemetry-java). - -The Grafana Alloy collector will be used once again, tasked with receiving traces and forwarding them to the Tempo backend. - -> aside positive -> -> Utilizing collectors offers several advantages for managing telemetry data: -> - Reduces the need for complicated application configurations: just send data to `localhost`, -> - Centralizes configuration to a single point: the collector, -> - Acts as a buffer to prevent resource overuse, -> - Can transform data before ingestion, -> - Supports data intake from various protocols and can relay them to any backend, -> - ... - -Lastly, we will use Grafana to examine and interpret these traces, allowing us to better understand and optimize our application's performance. - ### Enable distributed tracing To capture the entire transaction across all services in a trace, it's essential to instrument all the services in our application. 
@@ -1289,16 +1263,11 @@ otelcol.receiver.otlp "default" { endpoint = "0.0.0.0:4317" } -To capture the entire transaction across all services in a trace, it's essential to instrument all the services in our application. output { traces = [otelcol.processor.batch.default.input] } } -> aside positive -> -> In this workshop, our primary focus will be on the `easypay` service. -> For efficiency, we have already instrumented the other services beforehand. // BATCH PROCESSING FOR OPTIMIZATION (2) otelcol.processor.batch "default" { output { @@ -1306,158 +1275,7 @@ otelcol.processor.batch "default" { } } -#### Download Grafana Opentelemetry Java Agent // TRACE EXPORTING TO TEMPO (OTLP) (3) -otelcol.exporter.otlp "tempo" { - client { - endpoint = "tempo:4317" - -If you're using *GitPod*, the Java Agent should already be available in the `instrumentation/grafana-opentelemetry-java.jar` directory. - tls { - insecure = true - } - } -} -``` -1. Setting up the [``otelcol.receiver.otlp``](https://grafana.com/docs/alloy/latest/reference/components/otelcol.receiver.otlp/) receiver to accept telemetry data over the OTEL protocol via GRPC, listening on port `4317`, -2. Configuring the [processor](https://grafana.com/docs/alloy/latest/reference/components/otelcol.processor.batch/) to batch traces efficiently, reducing resource usage, -3. Establishing the [``otelcol.exporter.otlp``](https://grafana.com/docs/alloy/latest/reference/components/otelcol.exporter.otlp/) exporter to send collected telemetry data to the Grafana Tempo service. - -🛠️ If you are participating in this workshop on your workstation, or if the file is missing, you can run the following script to download it: -ℹ️ The Grafana OpenTelemetry Java Agent is pre-configured to transmit telemetry data directly to the collector. This setup is facilitated through environment variables specified in the `compose.yml` file: - -```yaml -services: - # ... - easypay-service: - # .. - environment: - # ... 
- OTEL_SERVICE_NAME: easypay-service (1) - OTEL_EXPORTER_OTLP_ENDPOINT: http://collector:4317 (2) - OTEL_EXPORTER_OTLP_PROTOCOL: grpc (3) - # ... -``` -1. `OTEL_SERVICE_NAME` defines a service name which will be attached to traces to identify the instrumented service, -2. `OTEL_EXPORTER_OTLP_ENDPOINT` environment variable configures where the telemetry data should be sent, -3. `OTEL_EXPORTER_OTLP_PROTOCOL` sets the OTLP protocol used behind, here GRPC (can be HTTP). - -> aside positive -> -> Find more information about how to configure the OpenTelemetry Java Agent in [its official documentation](https://opentelemetry.io/docs/languages/java/configuration/). - -#### Enable Java Agent - -📝 Since we are deploying the easypay-service using *Docker*, we need to modify the last lines of the `easypay-service/src/main/docker/Dockerfile`: - -```Dockerfile -# ... -USER javauser -🛠️ To apply the new settings, restart Grafana Alloy with the following command: - -# Copy Java Agent into the container -COPY instrumentation/grafana-opentelemetry-java.jar /app/grafana-opentelemetry-java.jar -```bash -docker compose restart collector -``` - -# Add the -javagent flag to setup the JVM to start with our Java Agent -ENTRYPOINT ["java", "-javaagent:/app/grafana-opentelemetry-java.jar", "-cp","app:app/lib/*","com.worldline.easypay.EasypayServiceApplication"] # (2) -``` -✅ After restarting, verify that Grafana Alloy is up and running with the updated configuration by accessing the Alloy dashboard on port ``12345``. - -The ENTRYPOINT instruction specifies the default command that will be executed when the container starts. 
-🛠️ Redeploy the updated ``easypay-service``: - -🛠️ You can now build the updated easypay-service container image: -```bash -docker compose up -d easypay-service -``` - -```bash -docker compose build easypay-service -``` -✅ To ensure easypay-service has started up correctly, check its logs with: - -#### Configure Grafana Alloy -```bash -docker compose logs -f easypay-service -``` - -It's time to set up *Grafana Alloy* for handling telemetry data. We will configure it to accept traces through the OpenTelemetry GRPC protocol (OTLP) on port `4317`, and then forward them to *Grafana Tempo*, which listens on the host `tempo` on the same port `4317` (this setup specifically handles OTLP traces). -#### Explore Traces with Grafana - -📝 Please add the following configuration to the `docker/alloy/config.alloy` file: -> aside positive -> -> For this workshop, we've already configured the Tempo datasource in Grafana. -> You can take a look at its configuration in Grafana (available on port ``3000``) by navigating to the `Connections` > `Data sources` section. -> Similar to Prometheus, the configuration is quite straightforward as we only need to set up the Tempo server URL. - -🛠️ Generate some load on the application to produce traces: - -```bash -k6 run -u 1 -d 5m k6/01-payment-only.js -``` - -🛠️ Let’s explore your first traces in Grafana: -* Go to Grafana and open an ``Explore`` dashboard, -* Select the `Tempo` data source and click on ``Run query`` to refresh the view. - -> aside negative -> -> You may need to wait one or two minutes to allow Tempo to ingest some traces… - -👀 Click on `Service Graph` and explore the `Node graph`: this view is extremely helpful for visualizing and understanding how our services communicate with each other. - -👀 Go back to `Search` and click on `Run query`. You should see a table named `Table - Traces`. -By default, this view provides the most recent traces available in *Tempo*. 
- -🛠️ Let's find an interesting trace using the query builder: -* Look at all traces corresponding to a POST to `easypay-service` with a duration greater than 50 ms: - * Span Name: `POST easypay-service` - * Duration: `trace` `>` `50ms` - * You can review the generated query, which uses a syntax called TraceQL. -* Click on `Run query`. -* Sort the table by `Duration` (click on the column name) to find the slowest trace. -* Drill down a `Trace ID`. - - -You should see the full stack of the corresponding transaction. - -👀 Grafana should open a new view (you can enlarge it by clicking on the three vertical dots and selecting `Widen pane`): -* Pinpoint the different nodes and their corresponding response times: - * Each line is a span and corresponds to the time spent in a method/event. -* Examine the SQL queries and their response times. -* Discover that distributed tracing can link transactions through: - * HTTP (`api-gateway` to `easypay-service` and `easypay-service` to `smartbank-gateway`). - * Kafka (`easypay-service` to `fraudetect-service` and `merchant-backoffice`). -* Click on `Node graph` to get a graphical view of all the spans participating in the trace. - -🛠️ Continue your exploration in the `Search` pane: -* For example, you can add the `Status` `=` `error` filter to see only traces that contain errors. -```terraform -// ... - -// RECEIVER SETUP (OTLP GRPC) (1) -otelcol.receiver.otlp "default" { - grpc { - endpoint = "0.0.0.0:4317" - } - - output { - traces = [otelcol.processor.batch.default.input] - } -} - -// BATCH PROCESSING FOR OPTIMIZATION (2) -otelcol.processor.batch "default" { - output { - traces = [otelcol.exporter.otlp.tempo.input] - } -} - -// TRACE EXPORTING TO TEMPO (OTLP) (3) otelcol.exporter.otlp "tempo" { client { endpoint = "tempo:4317" @@ -1577,7 +1395,6 @@ In this workshop, we will implement Tail Sampling. 
Modify the Alloy configuration file (``docker/alloy/config.alloy``) as follows: In the alloy configuration file (``docker/alloy/config.alloy``), uncomment this configuration just after the ``SAMPLING`` comment: -Modify the Alloy configuration file (``docker/alloy/config.alloy``) as follows: ``` // ... // RECEIVER (OTLP) @@ -1591,23 +1408,8 @@ otelcol.receiver.otlp "default" { } } -// TAIL SAMPLING (2) -// SAMPLING -// ... -// RECEIVER (OTLP) -otelcol.receiver.otlp "default" { - grpc { - endpoint = "0.0.0.0:4317" - } - - output { - traces = [otelcol.processor.tail_sampling.actuator.input] // (1) - } -} - // TAIL SAMPLING (2) otelcol.processor.tail_sampling "actuator" { - // Filter on http.url attribute (3) policy { name = "filter_http_url"