diff --git a/.changesets/maint_bryn_otel_update.md b/.changesets/maint_bryn_otel_update.md new file mode 100644 index 0000000000..f2735b645a --- /dev/null +++ b/.changesets/maint_bryn_otel_update.md @@ -0,0 +1,20 @@ +### Update to OpenTelemetry 0.20.0 ([PR #3649](https://github.com/apollographql/router/pull/3649)) + +The router now uses OpenTelemetry 0.20.0. This includes a number of fixes and improvements from upstream. + +In particular metrics have some significant changes: +* Prometheus metrics are now aligned with the [OpenTelemetry spec](https://opentelemetry.io/docs/specs/otel/compatibility/prometheus_and_openmetrics/), and will not report `service_name` on each individual metric. Resource attributes are now moved to a single `target_info` metric. + + Users should check that their dashboards and alerts are properly configured when upgrading. + +* The default service name for metrics is now `unknown_service` as per the [OpenTelemetry spec](https://opentelemetry.io/docs/concepts/sdk-configuration/general-sdk-configuration/#otel_service_name). + + Users should ensure to configure service name via router.yaml, or via the `OTEL_SERVICE_NAME` environment variable. + +* The order of priority for setting service name has been brought into line with the rest of the router configuration. The order of priority is now: + 1. `OTEL_RESOURCE_ATTRIBUTES` environment variable + 2. `OTEL_SERVICE_NAME` environment variable + 3. `resource_attributes` in router.yaml + 4. 
`service_name` in router.yaml + +By [@BrynCooke](https://github.com/BrynCooke) in https://github.com/apollographql/router/pull/3649 diff --git a/Cargo.lock b/Cargo.lock index 692b09baae..7977cd5d21 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -331,6 +331,7 @@ dependencies = [ "multimap 0.9.0", "notify", "nu-ansi-term 0.49.0", + "num-traits", "once_cell", "opentelemetry", "opentelemetry-aws", @@ -340,6 +341,7 @@ dependencies = [ "opentelemetry-otlp", "opentelemetry-prometheus", "opentelemetry-semantic-conventions", + "opentelemetry-stdout", "opentelemetry-zipkin", "opentelemetry_api", "p256 0.12.0", @@ -359,7 +361,7 @@ dependencies = [ "router-bridge", "rstack", "rust-embed", - "rustls 0.21.7", + "rustls", "rustls-pemfile", "schemars", "serde", @@ -380,12 +382,12 @@ dependencies = [ "thiserror", "tikv-jemallocator", "tokio", - "tokio-rustls 0.24.1", + "tokio-rustls", "tokio-stream", "tokio-tungstenite", "tokio-util", "toml 0.7.6", - "tonic 0.8.3", + "tonic", "tonic-build", "tower", "tower-http", @@ -558,7 +560,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" dependencies = [ "concurrent-queue", - "event-listener", + "event-listener 2.5.3", "futures-core", ] @@ -576,6 +578,127 @@ dependencies = [ "tokio", ] +[[package]] +name = "async-executor" +version = "1.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78f2db9467baa66a700abce2a18c5ad793f6f83310aca1284796fc3921d113fd" +dependencies = [ + "async-lock", + "async-task", + "concurrent-queue", + "fastrand 2.0.0", + "futures-lite", + "slab", +] + +[[package]] +name = "async-global-executor" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1b6f5d7df27bd294849f8eec66ecfc63d11814df7a4f5d74168a2394467b776" +dependencies = [ + "async-channel", + "async-executor", + "async-io", + "async-lock", + "blocking", + "futures-lite", + "once_cell", +] + 
+[[package]] +name = "async-io" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fc5b45d93ef0529756f812ca52e44c221b35341892d3dcc34132ac02f3dd2af" +dependencies = [ + "async-lock", + "autocfg", + "cfg-if", + "concurrent-queue", + "futures-lite", + "log", + "parking", + "polling", + "rustix 0.37.23", + "slab", + "socket2 0.4.9", + "waker-fn", +] + +[[package]] +name = "async-lock" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b" +dependencies = [ + "event-listener 2.5.3", +] + +[[package]] +name = "async-process" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf012553ce51eb7aa6dc2143804cc8252bd1cb681a1c5cb7fa94ca88682dee1d" +dependencies = [ + "async-io", + "async-lock", + "async-signal", + "blocking", + "cfg-if", + "event-listener 3.0.0", + "futures-lite", + "rustix 0.38.8", + "windows-sys 0.48.0", +] + +[[package]] +name = "async-signal" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4af361a844928cb7d36590d406709473a1b574f443094422ef166daa3b493208" +dependencies = [ + "async-io", + "async-lock", + "atomic-waker", + "cfg-if", + "concurrent-queue", + "futures-core", + "futures-io", + "libc", + "signal-hook-registry", + "slab", + "windows-sys 0.48.0", +] + +[[package]] +name = "async-std" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62565bb4402e926b29953c785397c6dc0391b7b446e45008b0049eb43cec6f5d" +dependencies = [ + "async-channel", + "async-global-executor", + "async-io", + "async-lock", + "async-process", + "crossbeam-utils", + "futures-channel", + "futures-core", + "futures-io", + "futures-lite", + "gloo-timers", + "kv-log-macro", + "log", + "memchr", + "once_cell", + "pin-project-lite", + "pin-utils", + "slab", + "wasm-bindgen-futures", +] + 
[[package]] name = "async-stream" version = "0.3.5" @@ -598,6 +721,12 @@ dependencies = [ "syn 2.0.29", ] +[[package]] +name = "async-task" +version = "4.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9441c6b2fe128a7c2bf680a44c34d0df31ce09e5b7e401fcca3faa483dbc921" + [[package]] name = "async-trait" version = "0.1.73" @@ -609,6 +738,12 @@ dependencies = [ "syn 2.0.29", ] +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + [[package]] name = "atty" version = "0.2.14" @@ -818,7 +953,7 @@ dependencies = [ "hyper-rustls", "lazy_static", "pin-project-lite", - "rustls 0.21.7", + "rustls", "tokio", "tower", "tracing", @@ -1111,6 +1246,22 @@ dependencies = [ "generic-array 0.14.7", ] +[[package]] +name = "blocking" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94c4ef1f913d78636d78d538eec1f18de81e481f44b1be0a81060090530846e1" +dependencies = [ + "async-channel", + "async-lock", + "async-task", + "fastrand 2.0.0", + "futures-io", + "futures-lite", + "piper", + "tracing", +] + [[package]] name = "bloomfilter" version = "1.0.12" @@ -1509,7 +1660,7 @@ checksum = "c2895653b4d9f1538a83970077cb01dfc77a4810524e51a110944688e916b18e" dependencies = [ "prost", "prost-types", - "tonic 0.9.2", + "tonic", "tracing-core", ] @@ -1531,7 +1682,7 @@ dependencies = [ "thread_local", "tokio", "tokio-stream", - "tonic 0.9.2", + "tonic", "tracing", "tracing-core", "tracing-subscriber", @@ -2360,6 +2511,17 @@ version = "2.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" +[[package]] +name = "event-listener" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"29e56284f00d94c1bc7fd3c77027b4623c88c1f53d8d2394c6199f2921dea325" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", +] + [[package]] name = "external-subgraph" version = "0.1.0" @@ -2561,13 +2723,13 @@ dependencies = [ "parking_lot 0.12.1", "rand 0.8.5", "redis-protocol", - "rustls 0.21.7", + "rustls", "rustls-native-certs", "rustls-webpki", "semver 1.0.18", "sha-1", "tokio", - "tokio-rustls 0.24.1", + "tokio-rustls", "tokio-stream", "tokio-util", "url", @@ -2818,6 +2980,18 @@ dependencies = [ "regex", ] +[[package]] +name = "gloo-timers" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b995a66bb87bebce9a0f4a95aed01daca4872c050bfcb21653361c03bc35e5c" +dependencies = [ + "futures-channel", + "futures-core", + "js-sys", + "wasm-bindgen", +] + [[package]] name = "graphql-introspection-query" version = "0.2.0" @@ -3179,10 +3353,10 @@ dependencies = [ "http", "hyper", "log", - "rustls 0.21.7", + "rustls", "rustls-native-certs", "tokio", - "tokio-rustls 0.24.1", + "tokio-rustls", ] [[package]] @@ -3323,6 +3497,17 @@ dependencies = [ "ghost", ] +[[package]] +name = "io-lifetimes" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" +dependencies = [ + "hermit-abi 0.3.2", + "libc", + "windows-sys 0.48.0", +] + [[package]] name = "ipnet" version = "2.8.0" @@ -3336,7 +3521,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ "hermit-abi 0.3.2", - "rustix", + "rustix 0.38.8", "windows-sys 0.48.0", ] @@ -3488,6 +3673,15 @@ dependencies = [ "libc", ] +[[package]] +name = "kv-log-macro" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f" +dependencies = [ + "log", +] + 
[[package]] name = "lazy-regex" version = "2.5.0" @@ -3622,6 +3816,12 @@ dependencies = [ "syn 2.0.29", ] +[[package]] +name = "linux-raw-sys" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" + [[package]] name = "linux-raw-sys" version = "0.4.5" @@ -3643,6 +3843,9 @@ name = "log" version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +dependencies = [ + "value-bag", +] [[package]] name = "lru" @@ -4071,9 +4274,9 @@ dependencies = [ [[package]] name = "opentelemetry" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f4b8347cc26099d3aeee044065ecc3ae11469796b4d65d065a23a584ed92a6f" +checksum = "9591d937bc0e6d2feb6f71a559540ab300ea49955229c347a517a28d27784c54" dependencies = [ "opentelemetry_api", "opentelemetry_sdk", @@ -4081,21 +4284,20 @@ dependencies = [ [[package]] name = "opentelemetry-aws" -version = "0.7.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72a394d24777936802edd6c03a68daab4db39630418c7e431a5648e9befa80b8" +checksum = "31120a0109c172a42096766ef10e772f4a89422932be2c3b7f335858ff49380d" dependencies = [ "once_cell", - "opentelemetry", + "opentelemetry_api", ] [[package]] name = "opentelemetry-datadog" -version = "0.7.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "daf08569fbddd2149b268e2bde2bca0bab84bc19ee2efcc234f855f49a911536" +checksum = "b5f4ecf595095d3b641dd2761a0c3d1f175d3d6c28f38e65418d8004ea3255dd" dependencies = [ - "async-trait", "futures-core", "http", "indexmap 1.9.3", @@ -4112,9 +4314,9 @@ dependencies = [ [[package]] name = "opentelemetry-http" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "a819b71d6530c4297b49b3cae2939ab3a8cc1b9f382826a1bc29dd0ca3864906" +checksum = "c7594ec0e11d8e33faf03530a4c49af7064ebba81c1480e01be67d90b356508b" dependencies = [ "async-trait", "bytes", @@ -4125,83 +4327,97 @@ dependencies = [ [[package]] name = "opentelemetry-jaeger" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08e028dc9f4f304e9320ce38c80e7cf74067415b1ad5a8750a38bae54a4d450d" +checksum = "876958ba9084f390f913fcf04ddf7bbbb822898867bb0a51cc28f2b9e5c1b515" dependencies = [ "async-trait", - "futures", - "futures-executor", + "futures-core", + "futures-util", "headers", "http", - "once_cell", "opentelemetry", "opentelemetry-http", "opentelemetry-semantic-conventions", "reqwest", - "thiserror", "thrift", "tokio", ] [[package]] name = "opentelemetry-otlp" -version = "0.12.0" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8af72d59a4484654ea8eb183fea5ae4eb6a41d7ac3e3bae5f4d2a282a3a7d3ca" +checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275" dependencies = [ "async-trait", - "futures", - "futures-util", + "futures-core", "http", - "opentelemetry", "opentelemetry-http", "opentelemetry-proto", + "opentelemetry-semantic-conventions", + "opentelemetry_api", + "opentelemetry_sdk", "prost", "reqwest", "thiserror", "tokio", - "tonic 0.8.3", + "tonic", ] [[package]] name = "opentelemetry-prometheus" -version = "0.12.0" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a9f186f6293ebb693caddd0595e66b74a6068fa51048e26e0bf9c95478c639c" +checksum = "c7d81bc254e2d572120363a2b16cdb0d715d301b5789be0cfc26ad87e4e10e53" dependencies = [ - "opentelemetry", + "once_cell", + "opentelemetry_api", + "opentelemetry_sdk", "prometheus", "protobuf", ] [[package]] name = "opentelemetry-proto" -version = "0.2.0" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "045f8eea8c0fa19f7d48e7bc3128a39c2e5c533d5c61298c548dfefc1064474c" +checksum = "b1e3f814aa9f8c905d0ee4bde026afd3b2577a97c10e1699912e3e44f0c4cbeb" dependencies = [ - "futures", - "futures-util", - "opentelemetry", + "opentelemetry_api", + "opentelemetry_sdk", "prost", - "tonic 0.8.3", + "tonic", ] [[package]] name = "opentelemetry-semantic-conventions" -version = "0.11.0" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24e33428e6bf08c6f7fcea4ddb8e358fab0fe48ab877a87c70c6ebe20f673ce5" +checksum = "73c9f9340ad135068800e7f1b24e9e09ed9e7143f5bf8518ded3d3ec69789269" dependencies = [ "opentelemetry", ] +[[package]] +name = "opentelemetry-stdout" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bd550321bc0f9d3f6dcbfe5c75262789de5b3e2776da2cbcfd2392aa05db0c6" +dependencies = [ + "futures-util", + "opentelemetry_api", + "opentelemetry_sdk", + "ordered-float 3.9.0", + "serde", + "serde_json", +] + [[package]] name = "opentelemetry-zipkin" -version = "0.17.0" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1fd48caee5e1db71454c95be32d1daeb6fae265321ff8f51b1efc8a50b0be80" +checksum = "eb966f01235207a6933c0aec98374fe9782df1c1d2b3d1db35c458451d138143" dependencies = [ "async-trait", "futures-core", @@ -4219,14 +4435,14 @@ dependencies = [ [[package]] name = "opentelemetry_api" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed41783a5bf567688eb38372f2b7a8530f5a607a4b49d38dd7573236c23ca7e2" +checksum = "8a81f725323db1b1206ca3da8bb19874bbd3f57c3bcd59471bfb04525b265b9b" dependencies = [ - "fnv", "futures-channel", "futures-util", "indexmap 1.9.3", + "js-sys", "once_cell", "pin-project-lite", "thiserror", @@ -4235,21 +4451,23 @@ dependencies = [ [[package]] name = "opentelemetry_sdk" -version = "0.19.0" +version = "0.20.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b3a2a91fdbfdd4d212c0dcc2ab540de2c2bcbbd90be17de7a7daf8822d010c1" +checksum = "fa8e705a0612d48139799fcbaba0d4a90f06277153e43dd2bdc16c6f0edd8026" dependencies = [ + "async-std", "async-trait", "crossbeam-channel", - "dashmap", - "fnv", "futures-channel", "futures-executor", "futures-util", "once_cell", "opentelemetry_api", + "ordered-float 3.9.0", "percent-encoding", "rand 0.8.5", + "regex", + "serde_json", "thiserror", "tokio", "tokio-stream", @@ -4497,6 +4715,17 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "piper" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "668d31b1c4eba19242f2088b2bf3316b82ca31082a8335764db4e083db7485d4" +dependencies = [ + "atomic-waker", + "fastrand 2.0.0", + "futures-io", +] + [[package]] name = "pkcs1" version = "0.4.1" @@ -4570,6 +4799,22 @@ dependencies = [ "syn 2.0.29", ] +[[package]] +name = "polling" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b2d323e8ca7996b3e23126511a523f7e62924d93ecd5ae73b333815b0eb3dce" +dependencies = [ + "autocfg", + "bitflags 1.3.2", + "cfg-if", + "concurrent-queue", + "libc", + "log", + "pin-project-lite", + "windows-sys 0.48.0", +] + [[package]] name = "polyval" version = "0.6.1" @@ -5022,14 +5267,14 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.21.7", + "rustls", "rustls-native-certs", "rustls-pemfile", "serde", "serde_json", "serde_urlencoded", "tokio", - "tokio-rustls 0.24.1", + "tokio-rustls", "tokio-util", "tower-service", "url", @@ -5323,27 +5568,29 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.8" +version = "0.37.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"19ed4fa021d81c8392ce04db050a3da9a60299050b7ae1cf482d862b54a7218f" +checksum = "4d69718bf81c6127a49dc64e44a742e8bb9213c0ff8869a22c308f84c1d4ab06" dependencies = [ - "bitflags 2.4.0", + "bitflags 1.3.2", "errno", + "io-lifetimes", "libc", - "linux-raw-sys", + "linux-raw-sys 0.3.8", "windows-sys 0.48.0", ] [[package]] -name = "rustls" -version = "0.20.8" +name = "rustix" +version = "0.38.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fff78fc74d175294f4e83b28343315ffcfb114b156f0185e9741cb5570f50e2f" +checksum = "19ed4fa021d81c8392ce04db050a3da9a60299050b7ae1cf482d862b54a7218f" dependencies = [ - "log", - "ring", - "sct", - "webpki", + "bitflags 2.4.0", + "errno", + "libc", + "linux-raw-sys 0.4.5", + "windows-sys 0.48.0", ] [[package]] @@ -6047,7 +6294,7 @@ dependencies = [ "cfg-if", "fastrand 2.0.0", "redox_syscall 0.3.5", - "rustix", + "rustix 0.38.8", "windows-sys 0.48.0", ] @@ -6347,24 +6594,13 @@ dependencies = [ "syn 2.0.29", ] -[[package]] -name = "tokio-rustls" -version = "0.23.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" -dependencies = [ - "rustls 0.20.8", - "tokio", - "webpki", -] - [[package]] name = "tokio-rustls" version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" dependencies = [ - "rustls 0.21.7", + "rustls", "tokio", ] @@ -6401,10 +6637,10 @@ checksum = "212d5dcb2a1ce06d81107c3d0ffa3121fe974b73f068c8282cb1c32328113b6c" dependencies = [ "futures-util", "log", - "rustls 0.21.7", + "rustls", "rustls-native-certs", "tokio", - "tokio-rustls 0.24.1", + "tokio-rustls", "tungstenite", ] @@ -6468,14 +6704,14 @@ dependencies = [ [[package]] name = "tonic" -version = "0.8.3" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8f219fad3b929bef19b1f86fbc0358d35daed8f2cac972037ac0dc10bbb8d5fb" +checksum = "3082666a3a6433f7f511c7192923fa1fe07c69332d3c6a2e6bb040b569199d5a" dependencies = [ "async-stream", "async-trait", "axum", - "base64 0.13.1", + "base64 0.21.4", "bytes", "flate2", "futures-core", @@ -6488,41 +6724,10 @@ dependencies = [ "percent-encoding", "pin-project", "prost", - "prost-derive", "rustls-native-certs", "rustls-pemfile", "tokio", - "tokio-rustls 0.23.4", - "tokio-stream", - "tokio-util", - "tower", - "tower-layer", - "tower-service", - "tracing", - "tracing-futures", -] - -[[package]] -name = "tonic" -version = "0.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3082666a3a6433f7f511c7192923fa1fe07c69332d3c6a2e6bb040b569199d5a" -dependencies = [ - "async-trait", - "axum", - "base64 0.21.4", - "bytes", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "hyper", - "hyper-timeout", - "percent-encoding", - "pin-project", - "prost", - "tokio", + "tokio-rustls", "tokio-stream", "tower", "tower-layer", @@ -6671,12 +6876,14 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.19.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00a39dcf9bfc1742fa4d6215253b33a6e474be78275884c216fc2a06267b3600" +checksum = "75327c6b667828ddc28f5e3f169036cb793c3f588d83bf0f262a7f062ffed3c8" dependencies = [ "once_cell", "opentelemetry", + "opentelemetry_sdk", + "smallvec", "tracing", "tracing-core", "tracing-log", @@ -6776,7 +6983,7 @@ dependencies = [ "httparse", "log", "rand 0.8.5", - "rustls 0.21.7", + "rustls", "sha1 0.10.6", "thiserror", "url", @@ -7025,6 +7232,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "value-bag" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d92ccd67fb88503048c01b59152a04effd0782d035a83a6d256ce6085f08f4a3" + [[package]] name = "vcpkg" version = "0.2.15" @@ -7181,16 +7394,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "webpki" -version = "0.22.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0e74f82d49d545ad128049b7e88f6576df2da6b02e9ce565c6f533be576957e" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "webpki-roots" version = "0.25.2" diff --git a/apollo-router/Cargo.toml b/apollo-router/Cargo.toml index c22dca60ea..c0594164a5 100644 --- a/apollo-router/Cargo.toml +++ b/apollo-router/Cargo.toml @@ -133,30 +133,35 @@ once_cell = "1.18.0" # groups `^tracing` and `^opentelemetry*` dependencies together as of # https://github.com/apollographql/router/pull/1509. A comment which exists # there (and on `tracing` packages below) should be updated should this change. -opentelemetry = { version = "0.19.0", features = ["rt-tokio", "metrics"] } -opentelemetry_api = "0.19.0" -opentelemetry-aws = "0.7.0" -opentelemetry-datadog = { version = "0.7.0", features = ["reqwest-client"] } -opentelemetry-http = "0.8.0" -opentelemetry-jaeger = { version = "0.18.0", features = [ +opentelemetry = { version = "0.20.0", features = [ + "rt-tokio", + "metrics", + "testing" +] } +opentelemetry_api = "0.20.0" +opentelemetry-aws = "0.8.0" +opentelemetry-datadog = { version = "0.8.0", features = ["reqwest-client"] } +opentelemetry-http = "0.9.0" +opentelemetry-jaeger = { version = "0.19.0", features = [ "collector_client", "reqwest_collector_client", "rt-tokio", ] } -opentelemetry-otlp = { version = "0.12.0", default-features = false, features = [ +opentelemetry-otlp = { version = "0.13.0", default-features = false, features = [ "grpc-tonic", + "gzip-tonic", "tonic", "tls", "http-proto", "metrics", "reqwest-client", ] } -opentelemetry-semantic-conventions = "0.11.0" -opentelemetry-zipkin = { version = "0.17.0", default-features = false, features = [ 
+opentelemetry-semantic-conventions = "0.12.0" +opentelemetry-zipkin = { version = "0.18.0", default-features = false, features = [ "reqwest-client", "reqwest-rustls", ] } -opentelemetry-prometheus = "0.12.0" +opentelemetry-prometheus = "0.13.0" paste = "1.0.14" pin-project-lite = "0.2.13" prometheus = "0.13" @@ -196,11 +201,11 @@ thiserror = "1.0.48" tokio = { version = "1.32.0", features = ["full"] } tokio-stream = { version = "0.1.14", features = ["sync", "net"] } tokio-util = { version = "0.7.9", features = ["net", "codec", "time"] } -tonic = { version = "0.8.3", features = [ +tonic = { version = "0.9.2", features = [ "transport", "tls", "tls-roots", - "gzip", + "gzip" ] } tower = { version = "0.4.13", features = ["full"] } tower-http = { version = "0.4.4", features = [ @@ -219,7 +224,7 @@ tower-service = "0.3.2" tracing = "0.1.37" tracing-core = "0.1.31" tracing-futures = { version = "0.2.5", features = ["futures-03"] } -tracing-opentelemetry = "0.19.0" +tracing-opentelemetry = "0.21.0" tracing-subscriber = { version = "0.3.17", features = ["env-filter", "json"] } url = { version = "2.4.1", features = ["serde"] } urlencoding = "2.1.3" @@ -266,7 +271,9 @@ insta = { version = "1.32.0", features = ["json", "redactions", "yaml"] } maplit = "1.0.2" memchr = { version = "2.6.3", default-features = false } mockall = "0.11.4" +num-traits = "0.2.16" once_cell = "1.18.0" +opentelemetry-stdout = { version = "0.1.0", features = ["trace"] } p256 = "0.12.0" rand_core = "0.6.4" redis = { version = "0.21.7", features = ["tokio-comp"] } diff --git a/apollo-router/src/executable.rs b/apollo-router/src/executable.rs index 8c1f669ed9..a340f480af 100644 --- a/apollo-router/src/executable.rs +++ b/apollo-router/src/executable.rs @@ -28,6 +28,7 @@ use url::Url; use crate::configuration::generate_config_schema; use crate::configuration::generate_upgrade; use crate::configuration::Discussed; +use crate::metrics::meter_provider; use crate::plugin::plugins; use 
crate::plugins::telemetry::reload::init_telemetry; use crate::router::ConfigurationSource; @@ -468,8 +469,12 @@ impl Executable { None => Self::inner_start(shutdown, schema, config, license, opt).await, }; - //We should be good to shutdown the tracer provider now as the router should have finished everything. - opentelemetry::global::shutdown_tracer_provider(); + // We should be good to shutdown OpenTelemetry now as the router should have finished everything. + tokio::task::spawn_blocking(move || { + opentelemetry::global::shutdown_tracer_provider(); + meter_provider().shutdown(); + }) + .await?; result } diff --git a/apollo-router/src/lib.rs b/apollo-router/src/lib.rs index 9b0f04db47..fb5befb06e 100644 --- a/apollo-router/src/lib.rs +++ b/apollo-router/src/lib.rs @@ -47,6 +47,9 @@ mod json_ext; #[macro_use] pub mod plugin; +#[macro_use] +pub(crate) mod metrics; + pub(crate) mod axum_factory; mod cache; mod configuration; diff --git a/apollo-router/src/metrics/aggregation.rs b/apollo-router/src/metrics/aggregation.rs new file mode 100644 index 0000000000..261e6765d7 --- /dev/null +++ b/apollo-router/src/metrics/aggregation.rs @@ -0,0 +1,451 @@ +use std::any::Any; +use std::borrow::Cow; +use std::collections::HashMap; +use std::mem; +use std::ops::DerefMut; +use std::sync::Arc; +use std::sync::Mutex; + +use derive_more::From; +use itertools::Itertools; +use opentelemetry::metrics::Callback; +use opentelemetry::metrics::Counter; +use opentelemetry::metrics::Histogram; +use opentelemetry::metrics::InstrumentProvider; +use opentelemetry::metrics::Meter; +use opentelemetry::metrics::MeterProvider; +use opentelemetry::metrics::ObservableCounter; +use opentelemetry::metrics::ObservableGauge; +use opentelemetry::metrics::ObservableUpDownCounter; +use opentelemetry::metrics::SyncCounter; +use opentelemetry::metrics::SyncHistogram; +use opentelemetry::metrics::SyncUpDownCounter; +use opentelemetry::metrics::Unit; +use opentelemetry::metrics::UpDownCounter; +use 
opentelemetry::KeyValue; +use opentelemetry_api::metrics::AsyncInstrument; +use opentelemetry_api::metrics::CallbackRegistration; +use opentelemetry_api::metrics::MetricsError; +use opentelemetry_api::metrics::Observer; + +use crate::metrics::filter::FilterMeterProvider; + +// This meter provider enables us to combine multiple meter providers. The reasons we need this are: +// 1. Prometheus meters are special. To dispose a meter is to dispose the entire registry. This means we need to make a best effort to keep them around. +// 2. To implement filtering we use a view. However this must be set during build of the meter provider, thus we need separate ones for Apollo and general metrics. +// Unlike the regular meter provider this implementation will return an existing meter if one has been created already rather than a new one. +// This is within the spec: https://opentelemetry.io/docs/specs/otel/metrics/api/#get-a-meter +// `Meters are identified by name, version, and schema_url fields. When more than one Meter of the same name, version, and schema_url is created, it is unspecified whether or under which conditions the same or different Meter instances are returned. 
It is a user error to create Meters with different attributes but the same identity.` + +#[derive(Hash, Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Debug)] +pub(crate) enum MeterProviderType { + PublicPrometheus, + Apollo, + Public, +} + +#[derive(Clone, Default)] +pub(crate) struct AggregateMeterProvider { + inner: Arc>, +} + +#[derive(Default)] +pub(crate) struct Inner { + providers: HashMap)>, + registered_instruments: Vec, +} + +#[derive(From)] +pub(crate) enum InstrumentWrapper { + U64Counter(Arc>), + F64Counter(Arc>), + I64UpDownCounter(Arc>), + F64UpDownCounter(Arc>), + I64Histogram(Arc>), + U64Histogram(Arc>), + F64Histogram(Arc>), +} + +#[derive(Eq, PartialEq, Hash)] +struct MeterId { + name: Cow<'static, str>, + version: Option>, + schema_url: Option>, + // Note that attributes are not part of the meter ID. +} + +impl AggregateMeterProvider { + /// The behaviour of this function is that if None is passed in, the meter will be left as is. + /// To disable meter_providers use a noop meter provider. + /// The old meter_provider if any is returned, and it is up to the caller to clean up. + /// Any registered instruments must be invalidated so that they are fetched again. + pub(crate) fn set( + &self, + meter_provider_type: MeterProviderType, + meter_provider: Option, + ) -> Option { + let mut inner = self.inner.lock().expect("lock poisoned"); + // As we are changing a meter provider we need to invalidate any registered instruments. + // Clearing these allows any weak references at callsites to be invalidated. + inner.registered_instruments.clear(); + + //Now update the meter provider + if let Some(meter_provider) = meter_provider { + inner + .providers + .insert( + meter_provider_type, + (meter_provider.clone(), HashMap::new()), + ) + .map(|(old_provider, _)| old_provider) + } else { + None + } + } + + /// Shutdown MUST be called from a blocking thread. 
+ pub(crate) fn shutdown(&self) { + let inner = self.inner.lock().expect("lock poisoned"); + for (meter_provider_type, (meter_provider, _)) in &inner.providers { + if let Err(e) = meter_provider.shutdown() { + ::tracing::error!(error = %e, meter_provider_type = ?meter_provider_type, "failed to shutdown meter provider") + } + } + } + + /// Create a registered instrument. This enables caching at callsites and invalidation at the meter provider via weak reference. + #[allow(dead_code)] + pub(crate) fn create_registered_instrument( + &self, + create_fn: impl Fn(&mut Inner) -> T, + ) -> Arc + where + Arc: Into, + { + let mut guard = self.inner.lock().expect("lock poisoned"); + let instrument = Arc::new((create_fn)(guard.deref_mut())); + guard.registered_instruments.push(instrument.clone().into()); + instrument + } + + #[cfg(test)] + pub(crate) fn registered_instruments(&self) -> usize { + self.inner + .lock() + .expect("lock poisoned") + .registered_instruments + .len() + } +} + +impl Inner { + pub(crate) fn meter(&mut self, name: impl Into>) -> Meter { + self.versioned_meter( + name, + None::>, + None::>, + None, + ) + } + pub(crate) fn versioned_meter( + &mut self, + name: impl Into>, + version: Option>>, + schema_url: Option>>, + attributes: Option>, + ) -> Meter { + let name = name.into(); + let version = version.map(|v| v.into()); + let schema_url = schema_url.map(|v| v.into()); + let mut meters = Vec::with_capacity(self.providers.len()); + + for (provider, existing_meters) in self.providers.values_mut() { + meters.push( + existing_meters + .entry(MeterId { + name: name.clone(), + version: version.clone(), + schema_url: schema_url.clone(), + }) + .or_insert_with(|| { + provider.versioned_meter( + name.clone(), + version.clone(), + schema_url.clone(), + attributes.clone(), + ) + }) + .clone(), + ); + } + + Meter::new(Arc::new(AggregateInstrumentProvider { meters })) + } +} + +impl MeterProvider for AggregateMeterProvider { + fn versioned_meter( + &self, + name: impl 
Into>, + version: Option>>, + schema_url: Option>>, + attributes: Option>, + ) -> Meter { + let mut inner = self.inner.lock().expect("lock poisoned"); + inner.versioned_meter(name, version, schema_url, attributes) + } +} + +pub(crate) struct AggregateInstrumentProvider { + meters: Vec, +} + +pub(crate) struct AggregateCounter { + delegates: Vec>, +} + +impl SyncCounter for AggregateCounter { + fn add(&self, value: T, attributes: &[KeyValue]) { + for counter in &self.delegates { + counter.add(value, attributes) + } + } +} + +pub(crate) struct AggregateObservableCounter { + delegates: Vec>, +} + +impl AsyncInstrument for AggregateObservableCounter { + fn observe(&self, value: T, attributes: &[KeyValue]) { + for counter in &self.delegates { + counter.observe(value, attributes) + } + } + + fn as_any(&self) -> Arc { + unreachable!() + } +} + +pub(crate) struct AggregateHistogram { + delegates: Vec>, +} + +impl SyncHistogram for AggregateHistogram { + fn record(&self, value: T, attributes: &[KeyValue]) { + for histogram in &self.delegates { + histogram.record(value, attributes) + } + } +} + +pub(crate) struct AggregateUpDownCounter { + delegates: Vec>, +} + +impl SyncUpDownCounter for AggregateUpDownCounter { + fn add(&self, value: T, attributes: &[KeyValue]) { + for counter in &self.delegates { + counter.add(value, attributes) + } + } +} + +pub(crate) struct AggregateObservableUpDownCounter { + delegates: Vec>, +} + +impl AsyncInstrument for AggregateObservableUpDownCounter { + fn observe(&self, value: T, attributes: &[KeyValue]) { + for counter in &self.delegates { + counter.observe(value, attributes) + } + } + + fn as_any(&self) -> Arc { + unreachable!() + } +} + +pub(crate) struct AggregateObservableGauge { + delegates: Vec>, +} + +impl AsyncInstrument for AggregateObservableGauge { + fn observe(&self, measurement: T, attributes: &[KeyValue]) { + for gauge in &self.delegates { + gauge.observe(measurement, attributes) + } + } + + fn as_any(&self) -> Arc { + 
unreachable!() + } +} +macro_rules! aggregate_instrument_fn { + ($name:ident, $ty:ty, $wrapper:ident, $implementation:ident) => { + fn $name( + &self, + name: Cow<'static, str>, + description: Option>, + unit: Option, + ) -> opentelemetry::metrics::Result<$wrapper<$ty>> { + let delegates = self + .meters + .iter() + .map(|p| { + let mut b = p.$name(name.clone()); + if let Some(description) = &description { + b = b.with_description(description.clone()); + } + if let Some(unit) = &unit { + b = b.with_unit(unit.clone()); + } + b.try_init() + }) + .try_collect()?; + Ok($wrapper::new(Arc::new($implementation { delegates }))) + } + }; +} + +// Observable instruments don't need to have a ton of optimisation because they are only read on demand. +macro_rules! aggregate_observable_instrument_fn { + ($name:ident, $ty:ty, $wrapper:ident, $implementation:ident) => { + fn $name( + &self, + name: Cow<'static, str>, + description: Option>, + unit: Option, + callback: Vec>, + ) -> opentelemetry::metrics::Result<$wrapper<$ty>> { + let callback: Vec>> = + callback.into_iter().map(|c| Arc::new(c)).collect_vec(); + let delegates = self + .meters + .iter() + .map(|p| { + let mut b = p.$name(name.clone()); + if let Some(description) = &description { + b = b.with_description(description.clone()); + } + if let Some(unit) = &unit { + b = b.with_unit(unit.clone()); + } + for callback in &callback { + let callback = callback.clone(); + b = b.with_callback(move |c| (*callback)(c)); + } + b.try_init() + }) + .try_collect()?; + Ok($wrapper::new(Arc::new($implementation { delegates }))) + } + }; +} + +impl InstrumentProvider for AggregateInstrumentProvider { + aggregate_instrument_fn!(u64_counter, u64, Counter, AggregateCounter); + aggregate_instrument_fn!(f64_counter, f64, Counter, AggregateCounter); + + aggregate_observable_instrument_fn!( + f64_observable_counter, + f64, + ObservableCounter, + AggregateObservableCounter + ); + aggregate_observable_instrument_fn!( + u64_observable_counter, + 
u64, + ObservableCounter, + AggregateObservableCounter + ); + + aggregate_instrument_fn!(u64_histogram, u64, Histogram, AggregateHistogram); + aggregate_instrument_fn!(f64_histogram, f64, Histogram, AggregateHistogram); + aggregate_instrument_fn!(i64_histogram, i64, Histogram, AggregateHistogram); + + aggregate_instrument_fn!( + i64_up_down_counter, + i64, + UpDownCounter, + AggregateUpDownCounter + ); + aggregate_instrument_fn!( + f64_up_down_counter, + f64, + UpDownCounter, + AggregateUpDownCounter + ); + + aggregate_observable_instrument_fn!( + i64_observable_up_down_counter, + i64, + ObservableUpDownCounter, + AggregateObservableUpDownCounter + ); + aggregate_observable_instrument_fn!( + f64_observable_up_down_counter, + f64, + ObservableUpDownCounter, + AggregateObservableUpDownCounter + ); + + aggregate_observable_instrument_fn!( + f64_observable_gauge, + f64, + ObservableGauge, + AggregateObservableGauge + ); + aggregate_observable_instrument_fn!( + i64_observable_gauge, + i64, + ObservableGauge, + AggregateObservableGauge + ); + aggregate_observable_instrument_fn!( + u64_observable_gauge, + u64, + ObservableGauge, + AggregateObservableGauge + ); + + fn register_callback( + &self, + instruments: &[Arc], + callbacks: Box, + ) -> opentelemetry_api::metrics::Result> { + // The reason that this is OK is that calling observe outside of a callback is a no-op. + // So the callback is called, an observable is updated, but only the observable associated with the correct meter will take effect + + let callback = Arc::new(callbacks); + let mut callback_registrations = Vec::with_capacity(self.meters.len()); + for meter in &self.meters { + let callback = callback.clone(); + // If this fails there is no recovery as some callbacks may be registered + callback_registrations.push(meter.register_callback(instruments, move |c| callback(c))?) 
+ } + Ok(Box::new(AggregatedCallbackRegistrations( + callback_registrations, + ))) + } +} + +struct AggregatedCallbackRegistrations(Vec>); +impl CallbackRegistration for AggregatedCallbackRegistrations { + fn unregister(&mut self) -> opentelemetry_api::metrics::Result<()> { + let mut errors = vec![]; + for mut registration in mem::take(&mut self.0) { + if let Err(err) = registration.unregister() { + errors.push(err); + } + } + + if errors.is_empty() { + Ok(()) + } else { + Err(MetricsError::Other(format!("{errors:?}"))) + } + } +} diff --git a/apollo-router/src/metrics/filter.rs b/apollo-router/src/metrics/filter.rs new file mode 100644 index 0000000000..c490d42025 --- /dev/null +++ b/apollo-router/src/metrics/filter.rs @@ -0,0 +1,333 @@ +use std::any::Any; +use std::borrow::Cow; +use std::sync::Arc; + +use buildstructor::buildstructor; +use opentelemetry::metrics::noop::NoopMeterProvider; +use opentelemetry::metrics::Callback; +use opentelemetry::metrics::Counter; +use opentelemetry::metrics::Histogram; +use opentelemetry::metrics::InstrumentProvider; +use opentelemetry::metrics::Meter; +use opentelemetry::metrics::MeterProvider; +use opentelemetry::metrics::ObservableCounter; +use opentelemetry::metrics::ObservableGauge; +use opentelemetry::metrics::ObservableUpDownCounter; +use opentelemetry::metrics::Unit; +use opentelemetry::metrics::UpDownCounter; +use opentelemetry_api::metrics::CallbackRegistration; +use opentelemetry_api::metrics::Observer; +use opentelemetry_api::Context; +use opentelemetry_api::KeyValue; +use regex::Regex; + +#[derive(Clone)] +pub(crate) struct FilterMeterProvider { + delegate: opentelemetry::sdk::metrics::MeterProvider, + deny: Option, + allow: Option, +} + +#[buildstructor] +impl FilterMeterProvider { + #[builder] + fn new( + delegate: opentelemetry::sdk::metrics::MeterProvider, + deny: Option, + allow: Option, + ) -> Self { + FilterMeterProvider { + delegate, + deny, + allow, + } + } + + pub(crate) fn private_metrics(delegate: 
opentelemetry::sdk::metrics::MeterProvider) -> Self { + FilterMeterProvider::builder() + .delegate(delegate) + .allow( + Regex::new( + r"apollo\.(graphos\.cloud|router\.(operations?|config|schema|query))(\..*|$)", + ) + .expect("regex should have been valid"), + ) + .build() + } + + pub(crate) fn public_metrics(delegate: opentelemetry::sdk::metrics::MeterProvider) -> Self { + FilterMeterProvider::builder() + .delegate(delegate) + .deny( + Regex::new(r"apollo\.router\.(config|entities)(\..*|$)") + .expect("regex should have been valid"), + ) + .build() + } + + pub(crate) fn shutdown(&self) -> opentelemetry::metrics::Result<()> { + self.delegate.shutdown() + } + + #[allow(dead_code)] + pub(crate) fn force_flush(&self, cx: &Context) -> opentelemetry::metrics::Result<()> { + self.delegate.force_flush(cx) + } +} + +struct FilteredInstrumentProvider { + delegate: Meter, + noop: Meter, + deny: Option, + allow: Option, +} + +macro_rules! filter_instrument_fn { + ($name:ident, $ty:ty, $wrapper:ident) => { + fn $name( + &self, + name: Cow<'static, str>, + description: Option>, + unit: Option, + ) -> opentelemetry::metrics::Result<$wrapper<$ty>> { + let mut builder = match (&self.deny, &self.allow) { + (Some(deny), Some(allow)) if deny.is_match(&name) && !allow.is_match(&name) => { + self.noop.$name(name) + } + (Some(deny), None) if deny.is_match(&name) => self.noop.$name(name), + (None, Some(allow)) if !allow.is_match(&name) => self.noop.$name(name), + (_, _) => self.delegate.$name(name), + }; + if let Some(description) = &description { + builder = builder.with_description(description.clone()) + } + if let Some(unit) = &unit { + builder = builder.with_unit(unit.clone()); + } + builder.try_init() + } + }; +} + +macro_rules! 
filter_observable_instrument_fn { + ($name:ident, $ty:ty, $wrapper:ident) => { + fn $name( + &self, + name: Cow<'static, str>, + description: Option>, + unit: Option, + callback: Vec>, + ) -> opentelemetry::metrics::Result<$wrapper<$ty>> { + let mut builder = match (&self.deny, &self.allow) { + (Some(deny), Some(allow)) if deny.is_match(&name) && !allow.is_match(&name) => { + self.noop.$name(name) + } + (Some(deny), None) if deny.is_match(&name) => self.noop.$name(name), + (None, Some(allow)) if !allow.is_match(&name) => self.noop.$name(name), + (_, _) => self.delegate.$name(name), + }; + if let Some(description) = &description { + builder = builder.with_description(description.clone()); + } + if let Some(unit) = &unit { + builder = builder.with_unit(unit.clone()); + } + + for callback in callback { + builder = builder.with_callback(callback); + } + + builder.try_init() + } + }; +} + +impl InstrumentProvider for FilteredInstrumentProvider { + filter_instrument_fn!(u64_counter, u64, Counter); + filter_instrument_fn!(f64_counter, f64, Counter); + + filter_observable_instrument_fn!(f64_observable_counter, f64, ObservableCounter); + filter_observable_instrument_fn!(u64_observable_counter, u64, ObservableCounter); + + filter_instrument_fn!(u64_histogram, u64, Histogram); + filter_instrument_fn!(f64_histogram, f64, Histogram); + filter_instrument_fn!(i64_histogram, i64, Histogram); + + filter_instrument_fn!(i64_up_down_counter, i64, UpDownCounter); + filter_instrument_fn!(f64_up_down_counter, f64, UpDownCounter); + + filter_observable_instrument_fn!(i64_observable_up_down_counter, i64, ObservableUpDownCounter); + filter_observable_instrument_fn!(f64_observable_up_down_counter, f64, ObservableUpDownCounter); + + filter_observable_instrument_fn!(f64_observable_gauge, f64, ObservableGauge); + filter_observable_instrument_fn!(i64_observable_gauge, i64, ObservableGauge); + filter_observable_instrument_fn!(u64_observable_gauge, u64, ObservableGauge); + + fn register_callback( 
+ &self, + instruments: &[Arc], + callbacks: Box, + ) -> opentelemetry_api::metrics::Result> { + self.delegate.register_callback(instruments, callbacks) + } +} + +impl MeterProvider for FilterMeterProvider { + fn versioned_meter( + &self, + name: impl Into>, + version: Option>>, + schema_url: Option>>, + attributes: Option>, + ) -> Meter { + let delegate = self + .delegate + .versioned_meter(name, version, schema_url, attributes); + Meter::new(Arc::new(FilteredInstrumentProvider { + noop: NoopMeterProvider::default().meter(""), + delegate, + deny: self.deny.clone(), + allow: self.allow.clone(), + })) + } +} + +#[cfg(test)] +mod test { + + use opentelemetry::metrics::MeterProvider; + use opentelemetry::metrics::Unit; + use opentelemetry::runtime; + use opentelemetry::sdk::metrics::MeterProviderBuilder; + use opentelemetry::sdk::metrics::PeriodicReader; + use opentelemetry::testing::metrics::InMemoryMetricsExporter; + use opentelemetry_api::Context; + + use crate::metrics::filter::FilterMeterProvider; + + #[tokio::test(flavor = "multi_thread")] + async fn test_private_metrics() { + let exporter = InMemoryMetricsExporter::default(); + let meter_provider = FilterMeterProvider::private_metrics( + MeterProviderBuilder::default() + .with_reader(PeriodicReader::builder(exporter.clone(), runtime::Tokio).build()) + .build(), + ); + let cx = Context::default(); + let filtered = meter_provider.versioned_meter("filtered", "".into(), "".into(), None); + filtered + .u64_counter("apollo.router.operations") + .init() + .add(1, &[]); + filtered + .u64_counter("apollo.router.operations.test") + .init() + .add(1, &[]); + filtered + .u64_counter("apollo.graphos.cloud.test") + .init() + .add(1, &[]); + filtered + .u64_counter("apollo.router.unknown.test") + .init() + .add(1, &[]); + meter_provider.force_flush(&cx).unwrap(); + + let metrics: Vec<_> = exporter + .get_finished_metrics() + .unwrap() + .into_iter() + .flat_map(|m| m.scope_metrics.into_iter()) + .flat_map(|m| m.metrics) + 
.collect(); + assert!(metrics + .iter() + .any(|m| m.name == "apollo.router.operations.test")); + + assert!(metrics.iter().any(|m| m.name == "apollo.router.operations")); + + assert!(metrics + .iter() + .any(|m| m.name == "apollo.graphos.cloud.test")); + + assert!(!metrics + .iter() + .any(|m| m.name == "apollo.router.unknown.test")); + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_public_metrics() { + let exporter = InMemoryMetricsExporter::default(); + let meter_provider = FilterMeterProvider::public_metrics( + MeterProviderBuilder::default() + .with_reader(PeriodicReader::builder(exporter.clone(), runtime::Tokio).build()) + .build(), + ); + let cx = Context::default(); + let filtered = meter_provider.versioned_meter("filtered", "".into(), "".into(), None); + filtered + .u64_counter("apollo.router.config") + .init() + .add(1, &[]); + filtered + .u64_counter("apollo.router.config.test") + .init() + .add(1, &[]); + filtered + .u64_counter("apollo.router.entities") + .init() + .add(1, &[]); + filtered + .u64_counter("apollo.router.entities.test") + .init() + .add(1, &[]); + meter_provider.force_flush(&cx).unwrap(); + + let metrics: Vec<_> = exporter + .get_finished_metrics() + .unwrap() + .into_iter() + .flat_map(|m| m.scope_metrics.into_iter()) + .flat_map(|m| m.metrics) + .collect(); + + assert!(!metrics.iter().any(|m| m.name == "apollo.router.config")); + assert!(!metrics + .iter() + .any(|m| m.name == "apollo.router.config.test")); + assert!(!metrics.iter().any(|m| m.name == "apollo.router.entities")); + assert!(!metrics + .iter() + .any(|m| m.name == "apollo.router.entities.test")); + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_description_and_unit() { + let exporter = InMemoryMetricsExporter::default(); + let meter_provider = FilterMeterProvider::private_metrics( + MeterProviderBuilder::default() + .with_reader(PeriodicReader::builder(exporter.clone(), runtime::Tokio).build()) + .build(), + ); + let cx = Context::default(); + 
let filtered = meter_provider.versioned_meter("filtered", "".into(), "".into(), None); + filtered + .u64_counter("apollo.router.operations") + .with_description("desc") + .with_unit(Unit::new("ms")) + .init() + .add(1, &[]); + meter_provider.force_flush(&cx).unwrap(); + + let metrics: Vec<_> = exporter + .get_finished_metrics() + .unwrap() + .into_iter() + .flat_map(|m| m.scope_metrics.into_iter()) + .flat_map(|m| m.metrics) + .collect(); + assert!(metrics.iter().any(|m| m.name == "apollo.router.operations" + && m.description == "desc" + && m.unit == Unit::new("ms"))); + } +} diff --git a/apollo-router/src/plugins/telemetry/metrics/layer.rs b/apollo-router/src/metrics/layer.rs similarity index 91% rename from apollo-router/src/plugins/telemetry/metrics/layer.rs rename to apollo-router/src/metrics/layer.rs index ba773f418e..8502e1da0d 100644 --- a/apollo-router/src/plugins/telemetry/metrics/layer.rs +++ b/apollo-router/src/metrics/layer.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; use std::fmt; +use std::sync::Arc; use std::sync::RwLock; use opentelemetry::metrics::Counter; @@ -8,7 +9,6 @@ use opentelemetry::metrics::Meter; use opentelemetry::metrics::MeterProvider; use opentelemetry::metrics::ObservableGauge; use opentelemetry::metrics::UpDownCounter; -use opentelemetry::Context as OtelContext; use opentelemetry::Key; use opentelemetry::KeyValue; use opentelemetry::Value; @@ -19,10 +19,12 @@ use tracing_subscriber::layer::Context; use tracing_subscriber::registry::LookupSpan; use tracing_subscriber::Layer; -use super::METRIC_PREFIX_COUNTER; -use super::METRIC_PREFIX_HISTOGRAM; -use super::METRIC_PREFIX_MONOTONIC_COUNTER; -use super::METRIC_PREFIX_VALUE; +use crate::metrics::aggregation::AggregateMeterProvider; + +pub(crate) const METRIC_PREFIX_MONOTONIC_COUNTER: &str = "monotonic_counter."; +pub(crate) const METRIC_PREFIX_COUNTER: &str = "counter."; +pub(crate) const METRIC_PREFIX_HISTOGRAM: &str = "histogram."; +pub(crate) const METRIC_PREFIX_VALUE: &str = 
"value."; macro_rules! log_and_panic_in_debug_build { ($($tokens:tt)+) => {{ @@ -61,7 +63,6 @@ pub(crate) enum InstrumentType { impl Instruments { pub(crate) fn update_metric( &self, - cx: &OtelContext, meter: &Meter, instrument_type: InstrumentType, metric_name: &'static str, @@ -97,7 +98,7 @@ impl Instruments { &self.u64_counter, metric_name, || meter.u64_counter(metric_name).init(), - |ctr| ctr.add(cx, value, custom_attributes), + |ctr| ctr.add(value, custom_attributes), ); } InstrumentType::CounterF64(value) => { @@ -105,7 +106,7 @@ impl Instruments { &self.f64_counter, metric_name, || meter.f64_counter(metric_name).init(), - |ctr| ctr.add(cx, value, custom_attributes), + |ctr| ctr.add(value, custom_attributes), ); } InstrumentType::UpDownCounterI64(value) => { @@ -113,7 +114,7 @@ impl Instruments { &self.i64_up_down_counter, metric_name, || meter.i64_up_down_counter(metric_name).init(), - |ctr| ctr.add(cx, value, custom_attributes), + |ctr| ctr.add(value, custom_attributes), ); } InstrumentType::UpDownCounterF64(value) => { @@ -121,7 +122,7 @@ impl Instruments { &self.f64_up_down_counter, metric_name, || meter.f64_up_down_counter(metric_name).init(), - |ctr| ctr.add(cx, value, custom_attributes), + |ctr| ctr.add(value, custom_attributes), ); } InstrumentType::HistogramU64(value) => { @@ -129,7 +130,7 @@ impl Instruments { &self.u64_histogram, metric_name, || meter.u64_histogram(metric_name).init(), - |rec| rec.record(cx, value, custom_attributes), + |rec| rec.record(value, custom_attributes), ); } InstrumentType::HistogramI64(value) => { @@ -137,7 +138,7 @@ impl Instruments { &self.i64_histogram, metric_name, || meter.i64_histogram(metric_name).init(), - |rec| rec.record(cx, value, custom_attributes), + |rec| rec.record(value, custom_attributes), ); } InstrumentType::HistogramF64(value) => { @@ -145,7 +146,7 @@ impl Instruments { &self.f64_histogram, metric_name, || meter.f64_histogram(metric_name).init(), - |rec| rec.record(cx, value, custom_attributes), + 
|rec| rec.record(value, custom_attributes), ); } InstrumentType::GaugeU64(value) => { @@ -153,7 +154,7 @@ impl Instruments { &self.u64_gauge, metric_name, || meter.u64_observable_gauge(metric_name).init(), - |gauge| gauge.observe(cx, value, custom_attributes), + |gauge| gauge.observe(value, custom_attributes), ); } }; @@ -161,10 +162,10 @@ impl Instruments { } pub(crate) struct MetricVisitor<'a> { + pub(crate) meter: &'a Meter, pub(crate) instruments: &'a Instruments, pub(crate) metric: Option<(&'static str, InstrumentType)>, pub(crate) custom_attributes: Vec, - pub(crate) meter: &'a Meter, attributes_ignored: bool, } @@ -418,9 +419,7 @@ impl<'a> Visit for MetricVisitor<'a> { impl<'a> MetricVisitor<'a> { fn finish(self) { if let Some((metric_name, instrument_type)) = self.metric { - let cx = OtelContext::current(); self.instruments.update_metric( - &cx, self.meter, instrument_type, metric_name, @@ -430,18 +429,36 @@ impl<'a> MetricVisitor<'a> { } } +#[derive(Clone)] pub(crate) struct MetricsLayer { + meter_provider: AggregateMeterProvider, + inner: Arc>, +} + +struct MetricsLayerInner { meter: Meter, instruments: Instruments, } impl MetricsLayer { - pub(crate) fn new(meter_provider: &impl MeterProvider) -> Self { + pub(crate) fn new(meter_provider: AggregateMeterProvider) -> Self { Self { + inner: Arc::new(RwLock::new(Self::new_inner(&meter_provider))), + meter_provider, + } + } + + fn new_inner(meter_provider: &AggregateMeterProvider) -> MetricsLayerInner { + MetricsLayerInner { meter: meter_provider.meter("apollo/router"), instruments: Default::default(), } } + /// Remove all the instruments from the metrics layer. These will be obtained again from the meter provider upon next use. 
+ pub(crate) fn clear(&self) { + let mut inner = self.inner.write().expect("lock poisoned"); + *inner = Self::new_inner(&self.meter_provider); + } } impl Layer for MetricsLayer @@ -449,9 +466,10 @@ where S: Subscriber + for<'span> LookupSpan<'span>, { fn on_event(&self, event: &tracing::Event<'_>, _ctx: Context<'_, S>) { + let inner = self.inner.read().expect("lock poisoned"); let mut metric_visitor = MetricVisitor { - instruments: &self.instruments, - meter: &self.meter, + meter: &inner.meter, + instruments: &inner.instruments, metric: None, custom_attributes: Vec::new(), attributes_ignored: false, diff --git a/apollo-router/src/metrics/mod.rs b/apollo-router/src/metrics/mod.rs new file mode 100644 index 0000000000..bbfbe78d29 --- /dev/null +++ b/apollo-router/src/metrics/mod.rs @@ -0,0 +1,1093 @@ +#[cfg(test)] +use std::future::Future; +#[cfg(test)] +use std::pin::Pin; +use std::sync::OnceLock; + +#[cfg(test)] +use futures::FutureExt; + +use crate::metrics::aggregation::AggregateMeterProvider; + +pub(crate) mod aggregation; +pub(crate) mod filter; +pub(crate) mod layer; + +// During tests this is a task local so that we can test metrics without having to worry about other tests interfering. 
+ +#[cfg(test)] +pub(crate) mod test_utils { + use std::fmt::Debug; + use std::fmt::Display; + use std::sync::Arc; + use std::sync::OnceLock; + use std::sync::Weak; + + use itertools::Itertools; + use num_traits::NumCast; + use num_traits::ToPrimitive; + use opentelemetry::sdk::metrics::data::Gauge; + use opentelemetry::sdk::metrics::data::Histogram; + use opentelemetry::sdk::metrics::data::ResourceMetrics; + use opentelemetry::sdk::metrics::data::Sum; + use opentelemetry::sdk::metrics::data::Temporality; + use opentelemetry::sdk::metrics::reader::AggregationSelector; + use opentelemetry::sdk::metrics::reader::MetricProducer; + use opentelemetry::sdk::metrics::reader::MetricReader; + use opentelemetry::sdk::metrics::reader::TemporalitySelector; + use opentelemetry::sdk::metrics::Aggregation; + use opentelemetry::sdk::metrics::InstrumentKind; + use opentelemetry::sdk::metrics::ManualReader; + use opentelemetry::sdk::metrics::MeterProviderBuilder; + use opentelemetry::sdk::metrics::Pipeline; + use opentelemetry::sdk::AttributeSet; + use opentelemetry_api::Array; + use opentelemetry_api::Context; + use opentelemetry_api::KeyValue; + use opentelemetry_api::Value; + use tokio::task_local; + + use crate::metrics::aggregation::AggregateMeterProvider; + use crate::metrics::aggregation::MeterProviderType; + use crate::metrics::filter::FilterMeterProvider; + task_local! { + pub(crate) static AGGREGATE_METER_PROVIDER_ASYNC: OnceLock<(AggregateMeterProvider, ClonableManualReader)>; + } + thread_local! 
{ + pub(crate) static AGGREGATE_METER_PROVIDER: OnceLock<(AggregateMeterProvider, ClonableManualReader)> = OnceLock::new(); + } + + #[derive(Debug, Clone, Default)] + pub(crate) struct ClonableManualReader { + reader: Arc, + } + + impl TemporalitySelector for ClonableManualReader { + fn temporality(&self, kind: InstrumentKind) -> Temporality { + self.reader.temporality(kind) + } + } + + impl AggregationSelector for ClonableManualReader { + fn aggregation(&self, kind: InstrumentKind) -> Aggregation { + self.reader.aggregation(kind) + } + } + impl MetricReader for ClonableManualReader { + fn register_pipeline(&self, pipeline: Weak) { + self.reader.register_pipeline(pipeline) + } + + fn register_producer(&self, producer: Box) { + self.reader.register_producer(producer) + } + + fn collect(&self, rm: &mut ResourceMetrics) -> opentelemetry_api::metrics::Result<()> { + self.reader.collect(rm) + } + + fn force_flush(&self, cx: &Context) -> opentelemetry_api::metrics::Result<()> { + self.reader.force_flush(cx) + } + + fn shutdown(&self) -> opentelemetry_api::metrics::Result<()> { + self.reader.shutdown() + } + } + + fn create_test_meter_provider() -> (AggregateMeterProvider, ClonableManualReader) { + { + let meter_provider = AggregateMeterProvider::default(); + let reader = ClonableManualReader::default(); + + meter_provider.set( + MeterProviderType::Public, + Some(FilterMeterProvider::public_metrics( + MeterProviderBuilder::default() + .with_reader(reader.clone()) + .build(), + )), + ); + + (meter_provider, reader) + } + } + pub(crate) fn meter_provider_and_readers() -> (AggregateMeterProvider, ClonableManualReader) { + if tokio::runtime::Handle::try_current().is_ok() { + if let Ok(task_local) = AGGREGATE_METER_PROVIDER_ASYNC + .try_with(|cell| cell.get_or_init(create_test_meter_provider).clone()) + { + task_local + } else { + // We need to silently fail here. 
Otherwise we fail every multi-threaded test that touches metrics + ( + AggregateMeterProvider::default(), + ClonableManualReader::default(), + ) + } + } else { + AGGREGATE_METER_PROVIDER + .with(|cell| cell.get_or_init(create_test_meter_provider).clone()) + } + } + + pub(crate) struct Metrics { + resource_metrics: ResourceMetrics, + } + + impl Default for Metrics { + fn default() -> Self { + Metrics { + resource_metrics: ResourceMetrics { + resource: Default::default(), + scope_metrics: vec![], + }, + } + } + } + + pub(crate) fn collect_metrics() -> Metrics { + let mut metrics = Metrics::default(); + let (_, reader) = meter_provider_and_readers(); + reader.collect(&mut metrics.resource_metrics).unwrap(); + metrics + } + + impl Metrics { + pub(crate) fn find( + &self, + name: &str, + ) -> Option<&opentelemetry::sdk::metrics::data::Metric> { + self.resource_metrics + .scope_metrics + .iter() + .flat_map(|scope_metrics| { + scope_metrics + .metrics + .iter() + .filter(|metric| metric.name == name) + }) + .next() + } + + pub(crate) fn assert( + &self, + name: &str, + ty: MetricType, + value: T, + attributes: &[KeyValue], + ) { + let attributes = AttributeSet::from(attributes); + if let Some(value) = value.to_u64() { + if self.metric_exists(name, &ty, value, &attributes) { + return; + } + } + + if let Some(value) = value.to_i64() { + if self.metric_exists(name, &ty, value, &attributes) { + return; + } + } + + if let Some(value) = value.to_f64() { + if self.metric_exists(name, &ty, value, &attributes) { + return; + } + } + + self.panic_metric_not_found(name, value, &attributes); + } + + fn panic_metric_not_found( + &self, + name: &str, + value: T, + attributes: &AttributeSet, + ) { + panic!( + "metric: {}, {}, {} not found.\nMetrics that were found:\n{}", + name, + value, + Self::pretty_attributes(attributes), + self.resource_metrics + .scope_metrics + .iter() + .flat_map(|scope_metrics| { scope_metrics.metrics.iter() }) + .flat_map(|metric| { Self::pretty_metric(metric) 
}) + .map(|metric| { format!(" {}", metric) }) + .join("\n") + ) + } + + fn pretty_metric(metric: &opentelemetry::sdk::metrics::data::Metric) -> Vec { + let mut results = Vec::new(); + results.append(&mut Self::pretty_data_point::(metric)); + results.append(&mut Self::pretty_data_point::(metric)); + results.append(&mut Self::pretty_data_point::(metric)); + results + } + + fn pretty_data_point( + metric: &opentelemetry::sdk::metrics::data::Metric, + ) -> Vec { + let mut results = Vec::new(); + if let Some(gauge) = metric.data.as_any().downcast_ref::>() { + for datapoint in gauge.data_points.iter() { + results.push(format!( + "\"{}\", {}, {}", + metric.name, + datapoint.value, + Self::pretty_attributes(&datapoint.attributes) + )); + } + } + if let Some(sum) = metric.data.as_any().downcast_ref::>() { + for datapoint in sum.data_points.iter() { + results.push(format!( + "\"{}\", {}, {}", + metric.name, + datapoint.value, + Self::pretty_attributes(&datapoint.attributes) + )); + } + } + if let Some(histogram) = metric.data.as_any().downcast_ref::>() { + for datapoint in histogram.data_points.iter() { + results.push(format!( + "\"{}\", {}, {}", + metric.name, + datapoint.sum, + Self::pretty_attributes(&datapoint.attributes) + )); + } + } + + results + } + + fn pretty_attributes(attributes: &AttributeSet) -> String { + attributes + .iter() + .map(|(key, value)| { + format!( + "\"{}\" => {}", + key.as_str(), + match value { + Value::Bool(v) => { + v.to_string() + } + Value::I64(v) => { + v.to_string() + } + Value::F64(v) => { + format!("{}f64", v) + } + Value::String(v) => { + format!("\"{}\"", v) + } + Value::Array(Array::Bool(v)) => { + format!("[{}]", v.iter().map(|v| v.to_string()).join(", ")) + } + Value::Array(Array::F64(v)) => { + format!("[{}]", v.iter().map(|v| format!("{}f64", v)).join(", ")) + } + Value::Array(Array::I64(v)) => { + format!("[{}]", v.iter().map(|v| v.to_string()).join(", ")) + } + Value::Array(Array::String(v)) => { + format!("[{}]", 
v.iter().map(|v| format!("\"{}\"", v)).join(", ")) + } + } + ) + }) + .join(", ") + } + + fn metric_exists( + &self, + name: &str, + ty: &MetricType, + value: T, + attributes: &AttributeSet, + ) -> bool { + if let Some(metric) = self.find(name) { + // Try to downcast the metric to each type of aggregation and assert that the value is correct. + if let Some(gauge) = metric.data.as_any().downcast_ref::>() { + // Find the datapoint with the correct attributes. + if matches!(ty, MetricType::Gauge) { + return gauge.data_points.iter().any(|datapoint| { + datapoint.attributes == *attributes && datapoint.value == value + }); + } + } else if let Some(sum) = metric.data.as_any().downcast_ref::>() { + // Note that we can't actually tell if the sum is monotonic or not, so we just check if it's a sum. + if matches!(ty, MetricType::Counter | MetricType::UpDownCounter) { + return sum.data_points.iter().any(|datapoint| { + datapoint.attributes == *attributes && datapoint.value == value + }); + } + } else if let Some(histogram) = metric.data.as_any().downcast_ref::>() + { + if matches!(ty, MetricType::Histogram) { + if let Some(value) = value.to_u64() { + return histogram.data_points.iter().any(|datapoint| { + datapoint.attributes == *attributes && datapoint.count == value + }); + } + } + } + } + false + } + } + + pub(crate) enum MetricType { + Counter, + UpDownCounter, + Histogram, + Gauge, + } +} +#[cfg(test)] +pub(crate) fn meter_provider() -> AggregateMeterProvider { + test_utils::meter_provider_and_readers().0 +} + +#[cfg(test)] +pub(crate) use test_utils::collect_metrics; + +#[cfg(not(test))] +static AGGREGATE_METER_PROVIDER: OnceLock = OnceLock::new(); +#[cfg(not(test))] +pub(crate) fn meter_provider() -> AggregateMeterProvider { + AGGREGATE_METER_PROVIDER + .get_or_init(Default::default) + .clone() +} + +#[macro_export] +/// Get or create a u64 monotonic counter metric and add a value to it +/// +/// This macro is a replacement for the telemetry crate's MetricsLayer. 
We will eventually convert all metrics to use these macros and deprecate the MetricsLayer. +/// The reason for this is that the MetricsLayer has: +/// * No support for dynamic attributes +/// * No support dynamic metrics. +/// * Imperfect mapping to metrics API that can only be checked at runtime. +/// New metrics should be added using these macros. +#[allow(unused_macros)] +macro_rules! u64_counter { + ($($name:ident).+, $description:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + metric!(u64, counter, add, stringify!($($name).+), $description, $value, &attributes); + }; + + ($($name:ident).+, $description:literal, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + metric!(u64, counter, add, stringify!($($name).+), $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + metric!(u64, counter, add, $name, $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + metric!(u64, counter, add, $name, $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $attrs: expr) => { + metric!(u64, counter, add, $name, $description, $value, $attrs); + }; + + ($name:literal, $description:literal, $value: expr) => { + metric!(u64, counter, add, $name, $description, $value, []); + } +} + +/// Get or create a f64 monotonic counter metric and add a value to it +/// +/// This macro is a replacement for the telemetry crate's MetricsLayer. 
We will eventually convert all metrics to use these macros and deprecate the MetricsLayer. +/// The reason for this is that the MetricsLayer has: +/// * No support for dynamic attributes +/// * No support dynamic metrics. +/// * Imperfect mapping to metrics API that can only be checked at runtime. +/// New metrics should be added using these macros. +#[allow(unused_macros)] +macro_rules! f64_counter { + ($($name:ident).+, $description:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + metric!(f64, counter, add, stringify!($($name).+), $description, $value, &attributes); + }; + + ($($name:ident).+, $description:literal, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + metric!(f64, counter, add, stringify!($($name).+), $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + metric!(f64, counter, add, $name, $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + metric!(f64, counter, add, $name, $description, $value, &attributes); + }; + ($name:literal, $description:literal, $value: expr, $attrs: expr) => { + metric!(f64, counter, add, $name, $description, $value, $attrs); + }; + + ($name:literal, $description:literal, $value: expr) => { + metric!(f64, counter, add, $name, $description, $value, &[]); + } +} + +/// Get or create an i64 up down counter metric and add a value to it +/// +/// This macro is a replacement for the telemetry crate's MetricsLayer. 
We will eventually convert all metrics to use these macros and deprecate the MetricsLayer. +/// The reason for this is that the MetricsLayer has: +/// * No support for dynamic attributes +/// * No support dynamic metrics. +/// * Imperfect mapping to metrics API that can only be checked at runtime. +/// New metrics should be added using these macros. + +#[allow(unused_macros)] +macro_rules! i64_up_down_counter { + ($($name:ident).+, $description:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + metric!(i64, up_down_counter, add, stringify!($($name).+), $description, $value, &attributes); + }; + + ($($name:ident).+, $description:literal, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + metric!(i64, up_down_counter, add, stringify!($($name).+), $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + metric!(i64, up_down_counter, add, $name, $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + metric!(i64, up_down_counter, add, $name, $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $attrs: expr) => { + metric!(i64, up_down_counter, add, $name, $description, $value, $attrs); + }; + + ($name:literal, $description:literal, $value: expr) => { + metric!(i64, up_down_counter, add, $name, $description, $value, &[]); + }; +} + +/// Get or create an f64 up down counter metric and add a value to it +/// +/// This macro is a replacement for the 
telemetry crate's MetricsLayer. We will eventually convert all metrics to use these macros and deprecate the MetricsLayer. +/// The reason for this is that the MetricsLayer has: +/// * No support for dynamic attributes +/// * No support dynamic metrics. +/// * Imperfect mapping to metrics API that can only be checked at runtime. +/// New metrics should be added using these macros. +#[allow(unused_macros)] +macro_rules! f64_up_down_counter { + ($($name:ident).+, $description:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + metric!(f64, up_down_counter, add, stringify!($($name).+), $description, $value, &attributes); + }; + + ($($name:ident).+, $description:literal, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + metric!(f64, up_down_counter, add, stringify!($($name).+), $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + metric!(f64, up_down_counter, add, $name, $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + metric!(f64, up_down_counter, add, $name, $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $attrs: expr) => { + metric!(f64, up_down_counter, add, $name, $description, $value, $attrs); + }; + + ($name:literal, $description:literal, $value: expr) => { + metric!(f64, up_down_counter, add, $name, $description, $value, &[]); + }; +} + +/// Get or create an f64 histogram metric and add a value to it +/// +/// This macro 
is a replacement for the telemetry crate's MetricsLayer. We will eventually convert all metrics to use these macros and deprecate the MetricsLayer. +/// The reason for this is that the MetricsLayer has: +/// * No support for dynamic attributes +/// * No support dynamic metrics. +/// * Imperfect mapping to metrics API that can only be checked at runtime. +/// New metrics should be added using these macros. +#[allow(unused_macros)] +macro_rules! f64_histogram { + ($($name:ident).+, $description:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + metric!(f64, histogram, record, stringify!($($name).+), $description, $value, &attributes); + }; + + ($($name:ident).+, $description:literal, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + metric!(f64, histogram, record, stringify!($($name).+), $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + metric!(f64, histogram, record, $name, $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + metric!(f64, histogram, record, $name, $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $attrs: expr) => { + metric!(f64, histogram, record, $name, $description, $value, $attrs); + }; + + ($name:literal, $description:literal, $value: expr) => { + metric!(f64, histogram, record, $name, $description, $value, &[]); + }; +} + +/// Get or create an u64 histogram metric and add a value to it +/// +/// This macro 
is a replacement for the telemetry crate's MetricsLayer. We will eventually convert all metrics to use these macros and deprecate the MetricsLayer. +/// The reason for this is that the MetricsLayer has: +/// * No support for dynamic attributes +/// * No support dynamic metrics. +/// * Imperfect mapping to metrics API that can only be checked at runtime. +/// New metrics should be added using these macros. +#[allow(unused_macros)] +macro_rules! u64_histogram { + ($($name:ident).+, $description:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + metric!(u64, histogram, record, stringify!($($name).+), $description, $value, &attributes); + }; + + ($($name:ident).+, $description:literal, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + metric!(u64, histogram, record, stringify!($($name).+), $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + metric!(u64, histogram, record, $name, $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + metric!(u64, histogram, record, $name, $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $attrs: expr) => { + metric!(u64, histogram, record, $name, $description, $value, $attrs); + }; + + ($name:literal, $description:literal, $value: expr) => { + metric!(u64, histogram, record, $name, $description, $value, &[]); + }; +} + +/// Get or create an i64 histogram metric and add a value to it +/// +/// This macro 
is a replacement for the telemetry crate's MetricsLayer. We will eventually convert all metrics to use these macros and deprecate the MetricsLayer. +/// The reason for this is that the MetricsLayer has: +/// * No support for dynamic attributes +/// * No support dynamic metrics. +/// * Imperfect mapping to metrics API that can only be checked at runtime. +/// New metrics should be added using these macros. +#[allow(unused_macros)] +macro_rules! i64_histogram { + ($($name:ident).+, $description:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + metric!(i64, histogram, record, stringify!($($name).+), $description, $value, &attributes); + }; + + ($($name:ident).+, $description:literal, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + metric!(i64, histogram, record, stringify!($($name).+), $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + metric!(i64, histogram, record, $name, $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + metric!(i64, histogram, record, $name, $description, $value, &attributes); + }; + + ($name:literal, $description:literal, $value: expr, $attrs: expr) => { + metric!(i64, histogram, record, $name, $description, $value, $attrs); + }; + + ($name:literal, $description:literal, $value: expr) => { + metric!(i64, histogram, record, $name, $description, $value, &[]); + }; +} + +thread_local! { + // This is used exactly once in testing callsite caching. 
+ #[cfg(test)] + pub(crate) static CACHE_CALLSITE: std::sync::atomic::AtomicBool = const {std::sync::atomic::AtomicBool::new(false)}; +} +macro_rules! metric { + ($ty:ident, $instrument:ident, $mutation:ident, $name:expr, $description:literal, $value: expr, $attrs: expr) => { + + // The way this works is that we have a static at each call site that holds a weak reference to the instrument. + // We make a call we try to upgrade the weak reference. If it succeeds we use the instrument. + // Otherwise we create a new instrument and update the static. + // The aggregate meter provider is used to hold on to references of all instruments that have been created and will clear references when the underlying configuration has changed. + // There is a Mutex involved, however it is only locked for the duration of the upgrade once the instrument has been created. + // The Reason a Mutex is used rather than an RwLock is that we are not holding the lock for any significant period of time and the cost of an RwLock is potentially higher. + // If we profile and deem it's worth switching to RwLock then we can do that. + + paste::paste! { + { + // There is a single test for caching callsites. Other tests do not cache because they will interfere with each other due to them using a task local meter provider to aid testing. 
+ #[cfg(test)] + let cache_callsite = crate::metrics::CACHE_CALLSITE.with(|cell| cell.load(std::sync::atomic::Ordering::SeqCst)); + + // The compiler will optimize this in non test builds + #[cfg(not(test))] + let cache_callsite = true; + + if cache_callsite { + static INSTRUMENT_CACHE: std::sync::OnceLock]<$ty>>>> = std::sync::OnceLock::new(); + + let mut instrument_guard = INSTRUMENT_CACHE + .get_or_init(|| { + let meter_provider = crate::metrics::meter_provider(); + let instrument_ref = meter_provider.create_registered_instrument(|p| p.meter("apollo/router").[<$ty _ $instrument>]($name).with_description($description).init()); + std::sync::Mutex::new(std::sync::Arc::downgrade(&instrument_ref)) + }) + .lock() + .expect("lock poisoned"); + let instrument = if let Some(instrument) = instrument_guard.upgrade() { + // Fast path, we got the instrument, drop the mutex guard immediately. + drop(instrument_guard); + instrument + } else { + // Slow path, we need to obtain the instrument again. + let meter_provider = crate::metrics::meter_provider(); + let instrument_ref = meter_provider.create_registered_instrument(|p| p.meter("apollo/router").[<$ty _ $instrument>]($name).with_description($description).init()); + *instrument_guard = std::sync::Arc::downgrade(&instrument_ref); + // We've updated the instrument and got a strong reference to it. We can drop the mutex guard now. + drop(instrument_guard); + instrument_ref + }; + instrument.$mutation($value, &$attrs); + } + else { + let meter_provider = crate::metrics::meter_provider(); + let meter = opentelemetry::metrics::MeterProvider::meter(&meter_provider, "apollo/router"); + let instrument = meter.[<$ty _ $instrument>]($name).with_description($description).init(); + instrument.$mutation($value, &$attrs); + } + } + } + }; +} + +#[cfg(test)] +macro_rules! 
assert_counter { + ($($name:ident).+, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + crate::metrics::collect_metrics().assert(stringify!($($name).+), crate::metrics::test_utils::MetricType::Counter, $value, &attributes); + }; + + ($($name:ident).+, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + crate::metrics::collect_metrics().assert(stringify!($($name).+), crate::metrics::test_utils::MetricType::Counter, $value, &attributes); + }; + + ($name:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + crate::metrics::collect_metrics().assert($name, crate::metrics::test_utils::MetricType::Counter, $value, &attributes); + }; + + ($name:literal, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + crate::metrics::collect_metrics().assert($name, crate::metrics::test_utils::MetricType::Counter, $value, &attributes); + }; + + ($name:literal, $value: expr) => { + crate::metrics::collect_metrics().assert($name, crate::metrics::test_utils::MetricType::Counter, $value, &[]); + }; +} + +#[cfg(test)] +macro_rules! 
assert_up_down_counter { + ($($name:ident).+, $ty: expr, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + crate::metrics::collect_metrics().assert(stringify!($($name).+), crate::metrics::test_utils::MetricType::UpDownCounter, $value, &attributes); + }; + + ($($name:ident).+, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + crate::metrics::collect_metrics().assert(stringify!($($name).+), crate::metrics::test_utils::MetricType::UpDownCounter, $value, &attributes); + }; + + ($name:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + crate::metrics::collect_metrics().assert($name, crate::metrics::test_utils::MetricType::UpDownCounter, $value, &attributes); + }; + + ($name:literal, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + crate::metrics::collect_metrics().assert($name, crate::metrics::test_utils::MetricType::UpDownCounter, $value, &attributes); + }; + + ($name:literal, $value: expr) => { + crate::metrics::collect_metrics().assert($name, crate::metrics::test_utils::MetricType::UpDownCounter, $value, &[]); + }; +} + +#[cfg(test)] +macro_rules! 
assert_gauge { + ($($name:ident).+, $ty: expr, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + crate::metrics::collect_metrics().assert(stringify!($($name).+), crate::metrics::test_utils::MetricType::Gauge, $value, &attributes); + }; + + ($($name:ident).+, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + crate::metrics::collect_metrics().assert(stringify!($($name).+), crate::metrics::test_utils::MetricType::Gauge, $value, &attributes); + }; + + ($name:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + crate::metrics::collect_metrics().assert($name, crate::metrics::test_utils::MetricType::Gauge, $value, &attributes); + }; + + ($name:literal, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + crate::metrics::collect_metrics().assert($name, crate::metrics::test_utils::MetricType::Gauge, $value, &attributes); + }; + + ($name:literal, $value: expr) => { + crate::metrics::collect_metrics().assert($name, crate::metrics::test_utils::MetricType::Gauge, $value, &[]); + }; +} + +#[cfg(test)] +macro_rules! 
assert_histogram { + ($($name:ident).+, $ty: expr, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + crate::metrics::collect_metrics().assert(stringify!($($name).+), crate::metrics::test_utils::MetricType::Histogram, $value, &attributes); + }; + + ($($name:ident).+, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + crate::metrics::collect_metrics().assert(stringify!($($name).+), crate::metrics::test_utils::MetricType::Histogram, $value, &attributes); + }; + + ($name:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new($attr_key, $attr_value)),+]; + crate::metrics::collect_metrics().assert($name, crate::metrics::test_utils::MetricType::Histogram, $value, &attributes); + }; + + ($name:literal, $value: expr, $($($attr_key:ident).+ = $attr_value:expr),+) => { + let attributes = vec![$(opentelemetry::KeyValue::new(stringify!($($attr_key).+), $attr_value)),+]; + crate::metrics::collect_metrics().assert($name, crate::metrics::test_utils::MetricType::Histogram, $value, &attributes); + }; + + ($name:literal, $value: expr) => { + crate::metrics::collect_metrics().assert($name, $value, &[]); + }; +} + +#[cfg(test)] +pub(crate) type MetricFuture = Pin::Output> + Send>>; + +#[cfg(test)] +pub(crate) trait FutureMetricsExt { + fn with_metrics( + self, + ) -> tokio::task::futures::TaskLocalFuture< + OnceLock<(AggregateMeterProvider, test_utils::ClonableManualReader)>, + MetricFuture, + > + where + Self: Sized + Future + Send + 'static, + ::Output: Send + 'static, + { + test_utils::AGGREGATE_METER_PROVIDER_ASYNC.scope( + Default::default(), + async move { + let result = self.await; + let _ = tokio::task::spawn_blocking(|| { + meter_provider().shutdown(); + }) + .await; + result + } + .boxed(), + ) 
+ } +} + +#[cfg(test)] +impl FutureMetricsExt for T where T: Future {} + +#[cfg(test)] +mod test { + use opentelemetry_api::metrics::MeterProvider; + use opentelemetry_api::KeyValue; + + use crate::metrics::aggregation::MeterProviderType; + use crate::metrics::meter_provider; + use crate::metrics::FutureMetricsExt; + + #[test] + fn test_gauge() { + meter_provider() + .meter("test") + .u64_observable_gauge("test") + .with_callback(|m| m.observe(5, &[])) + .init(); + assert_gauge!("test", 5); + } + + #[test] + fn test_no_attributes() { + u64_counter!("test", "test description", 1); + assert_counter!("test", 1); + } + + #[test] + fn test_dynamic_attributes() { + let attributes = vec![KeyValue::new("attr", "val")]; + u64_counter!("test", "test description", 1, attributes); + assert_counter!("test", 1, "attr" = "val"); + } + + #[test] + fn test_multiple_calls() { + fn my_method(val: &'static str) { + u64_counter!("test", "test description", 1, "attr" = val); + } + + my_method("jill"); + my_method("jill"); + my_method("bob"); + assert_counter!("test", 2, "attr" = "jill"); + assert_counter!("test", 1, "attr" = "bob"); + } + + #[test] + fn test_non_async() { + // Each test is run in a separate thread, metrics are stored in a thread local. 
+ u64_counter!("test", "test description", 1, "attr" = "val"); + assert_counter!("test", 1, "attr" = "val"); + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_async_multi() { + // Multi-threaded runtime needs to use a tokio task local to avoid tests interfering with each other + async { + u64_counter!("test", "test description", 1, "attr" = "val"); + assert_counter!("test", 1, "attr" = "val"); + } + .with_metrics() + .await; + } + + #[tokio::test] + async fn test_async_single() { + async { + // It's a single threaded tokio runtime, so we can still use a thread local + u64_counter!("test", "test description", 1, "attr" = "val"); + assert_counter!("test", 1, "attr" = "val"); + } + .with_metrics() + .await; + } + + #[tokio::test] + async fn test_u64_counter() { + async { + u64_counter!("test", "test description", 1, attr = "val"); + u64_counter!("test", "test description", 1, attr.test = "val"); + u64_counter!("test", "test description", 1, attr.test_underscore = "val"); + u64_counter!( + test.dot, + "test description", + 1, + "attr.test_underscore" = "val" + ); + u64_counter!( + test.dot, + "test description", + 1, + attr.test_underscore = "val" + ); + assert_counter!("test", 1, "attr" = "val"); + assert_counter!("test", 1, "attr.test" = "val"); + assert_counter!("test", 1, attr.test_underscore = "val"); + assert_counter!(test.dot, 2, attr.test_underscore = "val"); + assert_counter!(test.dot, 2, "attr.test_underscore" = "val"); + } + .with_metrics() + .await; + } + + #[tokio::test] + async fn test_f64_counter() { + async { + f64_counter!("test", "test description", 1.5, "attr" = "val"); + assert_counter!("test", 1.5, "attr" = "val"); + } + .with_metrics() + .await; + } + + #[tokio::test] + async fn test_i64_up_down_counter() { + async { + i64_up_down_counter!("test", "test description", 1, "attr" = "val"); + assert_up_down_counter!("test", 1, "attr" = "val"); + } + .with_metrics() + .await; + } + + #[tokio::test] + async fn test_f64_up_down_counter() { 
+ async { + f64_up_down_counter!("test", "test description", 1.5, "attr" = "val"); + assert_up_down_counter!("test", 1.5, "attr" = "val"); + } + .with_metrics() + .await; + } + + #[tokio::test] + async fn test_u64_histogram() { + async { + u64_histogram!("test", "test description", 1, "attr" = "val"); + assert_histogram!("test", 1, "attr" = "val"); + } + .with_metrics() + .await; + } + + #[tokio::test] + async fn test_i64_histogram() { + async { + i64_histogram!("test", "test description", 1, "attr" = "val"); + assert_histogram!("test", 1, "attr" = "val"); + } + .with_metrics() + .await; + } + + #[tokio::test] + async fn test_f64_histogram() { + async { + f64_histogram!("test", "test description", 1.0, "attr" = "val"); + assert_histogram!("test", 1, "attr" = "val"); + } + .with_metrics() + .await; + } + + #[tokio::test] + #[should_panic] + async fn test_type_histogram() { + async { + f64_histogram!("test", "test description", 1.0, "attr" = "val"); + assert_counter!("test", 1, "attr" = "val"); + } + .with_metrics() + .await; + } + + #[tokio::test] + #[should_panic] + async fn test_type_counter() { + async { + f64_counter!("test", "test description", 1.0, "attr" = "val"); + assert_histogram!("test", 1, "attr" = "val"); + } + .with_metrics() + .await; + } + + #[tokio::test] + #[should_panic] + async fn test_type_up_down_counter() { + async { + f64_up_down_counter!("test", "test description", 1.0, "attr" = "val"); + assert_histogram!("test", 1, "attr" = "val"); + } + .with_metrics() + .await; + } + + #[tokio::test] + #[should_panic] + async fn test_type_gauge() { + async { + meter_provider() + .meter("test") + .u64_observable_gauge("test") + .with_callback(|m| m.observe(5, &[])) + .init(); + assert_histogram!("test", 1, "attr" = "val"); + } + .with_metrics() + .await; + } + + #[test] + fn test_callsite_caching() { + // Creating instruments may be slow due to multiple levels of locking that needs to happen through the various metrics layers. 
+ // Callsite caching is implemented to prevent this happening on every call. + // See the metric macro above to see more information. + super::CACHE_CALLSITE.with(|cell| cell.store(true, std::sync::atomic::Ordering::SeqCst)); + fn test() { + // This is a single callsite so should only have one metric + u64_counter!("test", "test description", 1, "attr" = "val"); + } + + // Callsite hasn't been used yet, so there should be no metrics + assert_eq!(meter_provider().registered_instruments(), 0); + + // Call the metrics, it will be registered + test(); + assert_counter!("test", 1, "attr" = "val"); + assert_eq!(meter_provider().registered_instruments(), 1); + + // Call the metrics again, but the second call will not register a new metric because it will have be retrieved from the static + test(); + assert_counter!("test", 2, "attr" = "val"); + assert_eq!(meter_provider().registered_instruments(), 1); + + // Force invalidation of instruments + meter_provider().set(MeterProviderType::PublicPrometheus, None); + assert_eq!(meter_provider().registered_instruments(), 0); + + // Slow path + test(); + assert_eq!(meter_provider().registered_instruments(), 1); + + // Fast path + test(); + assert_eq!(meter_provider().registered_instruments(), 1); + } +} diff --git a/apollo-router/src/plugins/telemetry/formatters/mod.rs b/apollo-router/src/plugins/telemetry/formatters/mod.rs index 7237ddfa1a..e6bbd4c782 100644 --- a/apollo-router/src/plugins/telemetry/formatters/mod.rs +++ b/apollo-router/src/plugins/telemetry/formatters/mod.rs @@ -10,10 +10,10 @@ use tracing_subscriber::fmt::FormatEvent; use tracing_subscriber::fmt::FormatFields; use tracing_subscriber::registry::LookupSpan; -use super::metrics::METRIC_PREFIX_COUNTER; -use super::metrics::METRIC_PREFIX_HISTOGRAM; -use super::metrics::METRIC_PREFIX_MONOTONIC_COUNTER; -use super::metrics::METRIC_PREFIX_VALUE; +use crate::metrics::layer::METRIC_PREFIX_COUNTER; +use crate::metrics::layer::METRIC_PREFIX_HISTOGRAM; +use 
crate::metrics::layer::METRIC_PREFIX_MONOTONIC_COUNTER; +use crate::metrics::layer::METRIC_PREFIX_VALUE; pub(crate) const TRACE_ID_FIELD_NAME: &str = "trace_id"; diff --git a/apollo-router/src/plugins/telemetry/metrics/aggregation.rs b/apollo-router/src/plugins/telemetry/metrics/aggregation.rs deleted file mode 100644 index 10d321af87..0000000000 --- a/apollo-router/src/plugins/telemetry/metrics/aggregation.rs +++ /dev/null @@ -1,240 +0,0 @@ -use std::sync::Arc; - -use itertools::Itertools; -use opentelemetry::metrics::AsyncCounter; -use opentelemetry::metrics::AsyncGauge; -use opentelemetry::metrics::AsyncUpDownCounter; -use opentelemetry::metrics::Counter; -use opentelemetry::metrics::Histogram; -use opentelemetry::metrics::InstrumentProvider; -use opentelemetry::metrics::Meter; -use opentelemetry::metrics::MeterProvider; -use opentelemetry::metrics::ObservableCounter; -use opentelemetry::metrics::ObservableGauge; -use opentelemetry::metrics::ObservableUpDownCounter; -use opentelemetry::metrics::SyncCounter; -use opentelemetry::metrics::SyncHistogram; -use opentelemetry::metrics::SyncUpDownCounter; -use opentelemetry::metrics::Unit; -use opentelemetry::metrics::UpDownCounter; -use opentelemetry::Context; -use opentelemetry::InstrumentationLibrary; -use opentelemetry::KeyValue; - -#[derive(Clone, Default)] -pub(crate) struct AggregateMeterProvider { - providers: Vec>, -} -impl AggregateMeterProvider { - pub(crate) fn new( - providers: Vec>, - ) -> AggregateMeterProvider { - AggregateMeterProvider { providers } - } -} - -impl MeterProvider for AggregateMeterProvider { - fn versioned_meter( - &self, - name: &'static str, - version: Option<&'static str>, - schema_url: Option<&'static str>, - ) -> Meter { - Meter::new( - InstrumentationLibrary::new(name, version, schema_url), - Arc::new(AggregateInstrumentProvider { - meters: self - .providers - .iter() - .map(|p| p.versioned_meter(name, version, schema_url)) - .collect(), - }), - ) - } -} - -pub(crate) struct 
AggregateInstrumentProvider { - meters: Vec, -} - -pub(crate) struct AggregateCounter { - delegates: Vec>, -} - -impl SyncCounter for AggregateCounter { - fn add(&self, cx: &Context, value: T, attributes: &[KeyValue]) { - for counter in &self.delegates { - counter.add(cx, value, attributes) - } - } -} - -pub(crate) struct AggregateObservableCounter { - delegates: Vec>, -} - -impl AsyncCounter for AggregateObservableCounter { - fn observe(&self, cx: &Context, value: T, attributes: &[KeyValue]) { - for counter in &self.delegates { - counter.observe(cx, value, attributes) - } - } -} - -pub(crate) struct AggregateHistogram { - delegates: Vec>, -} - -impl SyncHistogram for AggregateHistogram { - fn record(&self, cx: &Context, value: T, attributes: &[KeyValue]) { - for histogram in &self.delegates { - histogram.record(cx, value, attributes) - } - } -} - -pub(crate) struct AggregateUpDownCounter { - delegates: Vec>, -} - -impl SyncUpDownCounter for AggregateUpDownCounter { - fn add(&self, cx: &Context, value: T, attributes: &[KeyValue]) { - for counter in &self.delegates { - counter.add(cx, value, attributes) - } - } -} - -pub(crate) struct AggregateObservableUpDownCounter { - delegates: Vec>, -} - -impl AsyncUpDownCounter for AggregateObservableUpDownCounter { - fn observe(&self, cx: &Context, value: T, attributes: &[KeyValue]) { - for counter in &self.delegates { - counter.observe(cx, value, attributes) - } - } -} - -pub(crate) struct AggregateObservableGauge { - delegates: Vec>, -} - -impl AsyncGauge for AggregateObservableGauge { - fn observe(&self, cx: &Context, value: T, attributes: &[KeyValue]) { - for gauge in &self.delegates { - gauge.observe(cx, value, attributes) - } - } -} - -macro_rules! 
aggregate_meter_fn { - ($name:ident, $ty:ty, $wrapper:ident, $implementation:ident) => { - fn $name( - &self, - name: String, - description: Option, - unit: Option, - ) -> opentelemetry::metrics::Result<$wrapper<$ty>> { - let delegates = self - .meters - .iter() - .map(|p| { - let mut b = p.$name(name.clone()); - if let Some(description) = &description { - b = b.with_description(description); - } - if let Some(unit) = &unit { - b = b.with_unit(unit.clone()); - } - b.try_init() - }) - .try_collect()?; - Ok($wrapper::new(Arc::new($implementation { delegates }))) - } - }; -} - -impl InstrumentProvider for AggregateInstrumentProvider { - aggregate_meter_fn!(u64_counter, u64, Counter, AggregateCounter); - aggregate_meter_fn!(f64_counter, f64, Counter, AggregateCounter); - - aggregate_meter_fn!( - f64_observable_counter, - f64, - ObservableCounter, - AggregateObservableCounter - ); - aggregate_meter_fn!( - u64_observable_counter, - u64, - ObservableCounter, - AggregateObservableCounter - ); - - aggregate_meter_fn!(u64_histogram, u64, Histogram, AggregateHistogram); - aggregate_meter_fn!(f64_histogram, f64, Histogram, AggregateHistogram); - aggregate_meter_fn!(i64_histogram, i64, Histogram, AggregateHistogram); - - aggregate_meter_fn!( - i64_up_down_counter, - i64, - UpDownCounter, - AggregateUpDownCounter - ); - aggregate_meter_fn!( - f64_up_down_counter, - f64, - UpDownCounter, - AggregateUpDownCounter - ); - - aggregate_meter_fn!( - i64_observable_up_down_counter, - i64, - ObservableUpDownCounter, - AggregateObservableUpDownCounter - ); - aggregate_meter_fn!( - f64_observable_up_down_counter, - f64, - ObservableUpDownCounter, - AggregateObservableUpDownCounter - ); - - aggregate_meter_fn!( - f64_observable_gauge, - f64, - ObservableGauge, - AggregateObservableGauge - ); - aggregate_meter_fn!( - i64_observable_gauge, - i64, - ObservableGauge, - AggregateObservableGauge - ); - aggregate_meter_fn!( - u64_observable_gauge, - u64, - ObservableGauge, - 
AggregateObservableGauge - ); - - fn register_callback( - &self, - callback: Box, - ) -> opentelemetry::metrics::Result<()> { - // The reason that this is OK is that calling observe outside of a callback is a no-op. - // So the callback is called, an observable is updated, but only the observable associated with the correct meter will take effect - - let callback = Arc::new(callback); - for meter in &self.meters { - let callback = callback.clone(); - // If this fails there is no recovery as some callbacks may be registered - meter.register_callback(move |c| callback(c))? - } - Ok(()) - } -} diff --git a/apollo-router/src/plugins/telemetry/metrics/apollo.rs b/apollo-router/src/plugins/telemetry/metrics/apollo.rs index 6424705c24..59ee3f4735 100644 --- a/apollo-router/src/plugins/telemetry/metrics/apollo.rs +++ b/apollo-router/src/plugins/telemetry/metrics/apollo.rs @@ -4,10 +4,11 @@ use std::sync::atomic::Ordering; use std::sync::OnceLock; use std::time::Duration; -use opentelemetry::sdk::export::metrics::aggregation; -use opentelemetry::sdk::metrics::selectors; +use opentelemetry::runtime; +use opentelemetry::sdk::metrics::PeriodicReader; use opentelemetry::sdk::Resource; -use opentelemetry::KeyValue; +use opentelemetry_api::KeyValue; +use opentelemetry_otlp::MetricsExporterBuilder; use opentelemetry_otlp::WithExportConfig; use sys_info::hostname; use tonic::metadata::MetadataMap; @@ -19,9 +20,10 @@ use crate::plugins::telemetry::apollo::Config; use crate::plugins::telemetry::apollo_exporter::get_uname; use crate::plugins::telemetry::apollo_exporter::ApolloExporter; use crate::plugins::telemetry::config::MetricsCommon; -use crate::plugins::telemetry::metrics::filter::FilterMeterProvider; +use crate::plugins::telemetry::metrics::CustomAggregationSelector; use crate::plugins::telemetry::metrics::MetricsBuilder; use crate::plugins::telemetry::metrics::MetricsConfigurator; +use crate::plugins::telemetry::otlp::CustomTemporalitySelector; use 
crate::plugins::telemetry::tracing::BatchProcessorConfig; mod duration_histogram; @@ -102,13 +104,31 @@ impl Config { tracing::debug!(endpoint = %endpoint, "creating Apollo OTLP metrics exporter"); let mut metadata = MetadataMap::new(); metadata.insert("apollo.api.key", key.parse()?); + let exporter = MetricsExporterBuilder::Tonic( + opentelemetry_otlp::new_exporter() + .tonic() + .with_endpoint(endpoint.as_str()) + .with_timeout(batch_processor.max_export_timeout) + .with_metadata(metadata) + .with_compression(opentelemetry_otlp::Compression::Gzip), + ) + .build_metrics_exporter( + Box::new(CustomTemporalitySelector( + opentelemetry::sdk::metrics::data::Temporality::Delta, + )), + Box::new( + CustomAggregationSelector::builder() + .boundaries(default_buckets()) + .build(), + ), + )?; + let reader = PeriodicReader::builder(exporter, runtime::Tokio) + .with_interval(Duration::from_secs(60)) + .build(); - let exporter = opentelemetry_otlp::new_pipeline() - .metrics( - selectors::simple::histogram(default_buckets()), - aggregation::delta_temporality_selector(), - opentelemetry::runtime::Tokio, - ) + builder.apollo_meter_provider_builder = builder + .apollo_meter_provider_builder + .with_reader(reader) .with_resource(Resource::new([ KeyValue::new( "apollo.router.id", @@ -126,24 +146,12 @@ impl Config { ), KeyValue::new("apollo.client.host", hostname()?), KeyValue::new("apollo.client.uname", get_uname()?), - ])) - .with_period(Duration::from_secs(60)) - .with_exporter( - opentelemetry_otlp::new_exporter() - .tonic() - .with_endpoint(endpoint.as_str()) - .with_timeout(batch_processor.max_export_timeout) - .with_metadata(metadata), - ) - .build()?; - builder = - builder.with_meter_provider(FilterMeterProvider::apollo_metrics(exporter.clone())); - builder = builder.with_exporter(exporter); + ])); Ok(builder) } fn configure_apollo_metrics( - builder: MetricsBuilder, + mut builder: MetricsBuilder, endpoint: &Url, key: &str, reference: &str, @@ -155,7 +163,8 @@ impl Config { 
let exporter = ApolloExporter::new(endpoint, batch_processor_config, key, reference, schema_id)?; - Ok(builder.with_apollo_metrics_collector(exporter.start())) + builder.apollo_metrics_sender = exporter.start(); + Ok(builder) } } diff --git a/apollo-router/src/plugins/telemetry/metrics/filter.rs b/apollo-router/src/plugins/telemetry/metrics/filter.rs index 929a8a9d17..8b13789179 100644 --- a/apollo-router/src/plugins/telemetry/metrics/filter.rs +++ b/apollo-router/src/plugins/telemetry/metrics/filter.rs @@ -1,301 +1 @@ -use std::sync::Arc; -use buildstructor::buildstructor; -use opentelemetry::metrics::noop::NoopMeterProvider; -use opentelemetry::metrics::Counter; -use opentelemetry::metrics::Histogram; -use opentelemetry::metrics::InstrumentProvider; -use opentelemetry::metrics::Meter; -use opentelemetry::metrics::MeterProvider; -use opentelemetry::metrics::ObservableCounter; -use opentelemetry::metrics::ObservableGauge; -use opentelemetry::metrics::ObservableUpDownCounter; -use opentelemetry::metrics::Unit; -use opentelemetry::metrics::UpDownCounter; -use opentelemetry::Context; -use opentelemetry::InstrumentationLibrary; -use regex::Regex; - -pub(crate) struct FilterMeterProvider { - delegate: T, - deny: Option, - allow: Option, -} - -#[buildstructor] -impl FilterMeterProvider { - #[builder] - fn new(delegate: T, deny: Option, allow: Option) -> Self { - FilterMeterProvider { - delegate, - deny, - allow, - } - } - - pub(crate) fn apollo_metrics(delegate: T) -> Self { - FilterMeterProvider::builder() - .delegate(delegate) - .allow( - Regex::new( - r"apollo\.(graphos\.cloud|router\.(operations?|config|schema|query))(\..*|$)", - ) - .expect("regex should have been valid"), - ) - .build() - } - - pub(crate) fn public_metrics(delegate: T) -> Self { - FilterMeterProvider::builder() - .delegate(delegate) - .deny( - Regex::new(r"apollo\.router\.(config|entities)(\..*|$)") - .expect("regex should have been valid"), - ) - .build() - } -} - -struct 
FilteredInstrumentProvider { - noop: Meter, - delegate: Meter, - deny: Option, - allow: Option, -} -macro_rules! filter_meter_fn { - ($name:ident, $ty:ty, $wrapper:ident) => { - fn $name( - &self, - name: String, - description: Option, - unit: Option, - ) -> opentelemetry::metrics::Result<$wrapper<$ty>> { - let mut builder = match (&self.deny, &self.allow) { - (Some(deny), Some(allow)) if deny.is_match(&name) && !allow.is_match(&name) => { - self.noop.$name(name) - } - (Some(deny), None) if deny.is_match(&name) => self.noop.$name(name), - (None, Some(allow)) if !allow.is_match(&name) => self.noop.$name(name), - (_, _) => self.delegate.$name(name), - }; - if let Some(description) = &description { - builder = builder.with_description(description); - } - if let Some(unit) = &unit { - builder = builder.with_unit(unit.clone()); - } - builder.try_init() - } - }; -} - -impl InstrumentProvider for FilteredInstrumentProvider { - filter_meter_fn!(u64_counter, u64, Counter); - filter_meter_fn!(f64_counter, f64, Counter); - - filter_meter_fn!(f64_observable_counter, f64, ObservableCounter); - filter_meter_fn!(u64_observable_counter, u64, ObservableCounter); - - filter_meter_fn!(u64_histogram, u64, Histogram); - filter_meter_fn!(f64_histogram, f64, Histogram); - filter_meter_fn!(i64_histogram, i64, Histogram); - - filter_meter_fn!(i64_up_down_counter, i64, UpDownCounter); - filter_meter_fn!(f64_up_down_counter, f64, UpDownCounter); - - filter_meter_fn!(i64_observable_up_down_counter, i64, ObservableUpDownCounter); - filter_meter_fn!(f64_observable_up_down_counter, f64, ObservableUpDownCounter); - - filter_meter_fn!(f64_observable_gauge, f64, ObservableGauge); - filter_meter_fn!(i64_observable_gauge, i64, ObservableGauge); - filter_meter_fn!(u64_observable_gauge, u64, ObservableGauge); - - fn register_callback( - &self, - callback: Box, - ) -> opentelemetry::metrics::Result<()> { - self.delegate.register_callback(callback) - } -} - -impl MeterProvider for FilterMeterProvider { - 
fn versioned_meter( - &self, - name: &'static str, - version: Option<&'static str>, - schema_url: Option<&'static str>, - ) -> Meter { - let delegate = self.delegate.versioned_meter(name, version, schema_url); - Meter::new( - InstrumentationLibrary::new(name, version, schema_url), - Arc::new(FilteredInstrumentProvider { - noop: NoopMeterProvider::new().versioned_meter(name, version, schema_url), - delegate, - deny: self.deny.clone(), - allow: self.allow.clone(), - }), - ) - } -} - -#[cfg(test)] -mod test { - use std::collections::HashSet; - use std::sync::atomic::AtomicU64; - use std::sync::atomic::Ordering; - use std::sync::Arc; - use std::sync::Mutex; - - use opentelemetry::metrics::noop; - use opentelemetry::metrics::Counter; - use opentelemetry::metrics::InstrumentProvider; - use opentelemetry::metrics::Meter; - use opentelemetry::metrics::MeterProvider; - use opentelemetry::metrics::Unit; - use opentelemetry::Context; - use opentelemetry::InstrumentationLibrary; - - use crate::plugins::telemetry::metrics::filter::FilterMeterProvider; - - #[derive(Default, Clone)] - struct MockInstrumentProvider { - #[allow(clippy::type_complexity)] - counters_created: Arc, Option)>>>, - callbacks_registered: Arc, - } - - impl InstrumentProvider for MockInstrumentProvider { - // We're only going to bother with testing counters and callbacks because the code is implemented as a macro and if it's right for counters it's right for everything else. 
- fn u64_counter( - &self, - name: String, - description: Option, - unit: Option, - ) -> opentelemetry::metrics::Result> { - self.counters_created - .lock() - .expect("lock should not be poisoned") - .insert((name, description, unit)); - Ok(Counter::new(Arc::new(noop::NoopSyncInstrument::new()))) - } - - fn register_callback( - &self, - _callback: Box, - ) -> opentelemetry::metrics::Result<()> { - self.callbacks_registered.fetch_add(1, Ordering::SeqCst); - Ok(()) - } - } - - #[derive(Default, Clone)] - struct MockMeterProvider { - instrument_provider: Arc, - } - - impl MeterProvider for MockMeterProvider { - fn versioned_meter( - &self, - name: &'static str, - version: Option<&'static str>, - schema_url: Option<&'static str>, - ) -> Meter { - Meter::new( - InstrumentationLibrary::new(name, version, schema_url), - self.instrument_provider.clone(), - ) - } - } - - #[test] - fn test_apollo_metrics() { - let delegate = MockMeterProvider::default(); - let filtered = FilterMeterProvider::apollo_metrics(delegate.clone()) - .versioned_meter("filtered", None, None); - filtered.u64_counter("apollo.router.operations").init(); - filtered.u64_counter("apollo.router.operations.test").init(); - filtered.u64_counter("apollo.graphos.cloud.test").init(); - filtered.u64_counter("apollo.router.unknown.test").init(); - assert!(delegate - .instrument_provider - .counters_created - .lock() - .unwrap() - .contains(&("apollo.router.operations.test".to_string(), None, None))); - assert!(delegate - .instrument_provider - .counters_created - .lock() - .unwrap() - .contains(&("apollo.router.operations".to_string(), None, None))); - assert!(delegate - .instrument_provider - .counters_created - .lock() - .unwrap() - .contains(&("apollo.graphos.cloud.test".to_string(), None, None))); - assert!(!delegate - .instrument_provider - .counters_created - .lock() - .unwrap() - .contains(&("apollo.router.unknown.test".to_string(), None, None))); - } - - #[test] - fn test_public_metrics() { - let delegate 
= MockMeterProvider::default(); - let filtered = FilterMeterProvider::public_metrics(delegate.clone()) - .versioned_meter("filtered", None, None); - filtered.u64_counter("apollo.router.config").init(); - filtered.u64_counter("apollo.router.config.test").init(); - filtered.u64_counter("apollo.router.entities").init(); - filtered.u64_counter("apollo.router.entities.test").init(); - assert!(!delegate - .instrument_provider - .counters_created - .lock() - .unwrap() - .contains(&("apollo.router.config".to_string(), None, None))); - assert!(!delegate - .instrument_provider - .counters_created - .lock() - .unwrap() - .contains(&("apollo.router.config.test".to_string(), None, None))); - assert!(!delegate - .instrument_provider - .counters_created - .lock() - .unwrap() - .contains(&("apollo.router.entities".to_string(), None, None))); - assert!(!delegate - .instrument_provider - .counters_created - .lock() - .unwrap() - .contains(&("apollo.router.entities.test".to_string(), None, None))); - } - - #[test] - fn test_description_and_unit() { - let delegate = MockMeterProvider::default(); - let filtered = FilterMeterProvider::apollo_metrics(delegate.clone()) - .versioned_meter("filtered", None, None); - filtered - .u64_counter("apollo.router.operations") - .with_description("desc") - .with_unit(Unit::new("ms")) - .init(); - assert!(delegate - .instrument_provider - .counters_created - .lock() - .unwrap() - .contains(&( - "apollo.router.operations".to_string(), - Some("desc".to_string()), - Some(Unit::new("ms")) - ))); - } -} diff --git a/apollo-router/src/plugins/telemetry/metrics/mod.rs b/apollo-router/src/plugins/telemetry/metrics/mod.rs index 08a4f6637d..a6b0087da9 100644 --- a/apollo-router/src/plugins/telemetry/metrics/mod.rs +++ b/apollo-router/src/plugins/telemetry/metrics/mod.rs @@ -1,6 +1,5 @@ -use std::any::Any; use std::collections::HashMap; -use std::sync::Arc; +use std::time::Duration; use ::serde::Deserialize; use access_json::JSONQuery; @@ -8,9 +7,13 @@ use 
http::header::HeaderName; use http::response::Parts; use http::HeaderMap; use multimap::MultiMap; -use opentelemetry::metrics::Counter; -use opentelemetry::metrics::Histogram; -use opentelemetry::metrics::MeterProvider; +use opentelemetry::sdk::metrics::reader::AggregationSelector; +use opentelemetry::sdk::metrics::Aggregation; +use opentelemetry::sdk::metrics::InstrumentKind; +use opentelemetry::sdk::resource::ResourceDetector; +use opentelemetry::sdk::resource::SdkProvidedResourceDetector; +use opentelemetry::sdk::Resource; +use opentelemetry_api::KeyValue; use regex::Regex; use schemars::JsonSchema; use serde::Serialize; @@ -24,26 +27,17 @@ use crate::plugin::serde::deserialize_json_query; use crate::plugin::serde::deserialize_regex; use crate::plugins::telemetry::apollo_exporter::Sender; use crate::plugins::telemetry::config::AttributeValue; +use crate::plugins::telemetry::config::Conf; use crate::plugins::telemetry::config::MetricsCommon; -use crate::plugins::telemetry::metrics::aggregation::AggregateMeterProvider; use crate::router_factory::Endpoint; use crate::Context; use crate::ListenAddr; -pub(crate) mod aggregation; pub(crate) mod apollo; -pub(crate) mod filter; -pub(crate) mod layer; pub(crate) mod otlp; pub(crate) mod prometheus; pub(crate) mod span_metrics_exporter; - -pub(crate) const METRIC_PREFIX_MONOTONIC_COUNTER: &str = "monotonic_counter."; -pub(crate) const METRIC_PREFIX_COUNTER: &str = "counter."; -pub(crate) const METRIC_PREFIX_HISTOGRAM: &str = "histogram."; -pub(crate) const METRIC_PREFIX_VALUE: &str = "value."; - -pub(crate) type MetricsExporterHandle = Box; +static UNKNOWN_SERVICE: &str = "unknown_service"; #[derive(Debug, Clone, Deserialize, JsonSchema)] #[serde(deny_unknown_fields)] @@ -487,52 +481,95 @@ impl AttributesForwardConf { } } -#[derive(Default)] pub(crate) struct MetricsBuilder { - exporters: Vec, - meter_providers: Vec>, - custom_endpoints: MultiMap, - apollo_metrics: Sender, + pub(crate) public_meter_provider_builder: 
opentelemetry::sdk::metrics::MeterProviderBuilder, + pub(crate) apollo_meter_provider_builder: opentelemetry::sdk::metrics::MeterProviderBuilder, + pub(crate) prometheus_meter_provider: Option, + pub(crate) custom_endpoints: MultiMap, + pub(crate) apollo_metrics_sender: Sender, + pub(crate) resource: Resource, } -impl MetricsBuilder { - pub(crate) fn exporters(&mut self) -> Vec { - std::mem::take(&mut self.exporters) - } - pub(crate) fn meter_provider(&mut self) -> AggregateMeterProvider { - AggregateMeterProvider::new(std::mem::take(&mut self.meter_providers)) - } - pub(crate) fn custom_endpoints(&mut self) -> MultiMap { - std::mem::take(&mut self.custom_endpoints) - } - - pub(crate) fn apollo_metrics_provider(&mut self) -> Sender { - self.apollo_metrics.clone() +struct ConfigResourceDetector(MetricsCommon); + +impl ResourceDetector for ConfigResourceDetector { + fn detect(&self, _timeout: Duration) -> Resource { + let mut resource = Resource::new( + vec![ + self.0.service_name.clone().map(|service_name| { + KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_NAME, + service_name, + ) + }), + self.0.service_namespace.clone().map(|service_namespace| { + KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_NAMESPACE, + service_namespace, + ) + }), + ] + .into_iter() + .flatten() + .collect::>(), + ); + resource = resource.merge(&mut Resource::new( + self.0 + .resources + .clone() + .into_iter() + .map(|(k, v)| KeyValue::new(k, v)), + )); + resource } } impl MetricsBuilder { - fn with_exporter(mut self, handle: T) -> Self { - self.exporters.push(Box::new(handle)); - self - } - - fn with_meter_provider( - mut self, - meter_provider: T, - ) -> Self { - self.meter_providers.push(Arc::new(meter_provider)); - self - } - - fn with_custom_endpoint(mut self, listen_addr: ListenAddr, endpoint: Endpoint) -> Self { - self.custom_endpoints.insert(listen_addr, endpoint); - self - } + pub(crate) fn new(config: &Conf) -> Self { + let 
metrics_common_config = config + .metrics + .clone() + .and_then(|m| m.common) + .unwrap_or_default(); + + let mut resource = Resource::from_detectors( + Duration::from_secs(0), + vec![ + Box::new(ConfigResourceDetector(metrics_common_config.clone())), + Box::new(SdkProvidedResourceDetector), + Box::new(opentelemetry::sdk::resource::EnvResourceDetector::new()), + ], + ); + + // Otel resources can be initialized from env variables, there is an override mechanism, but it's broken for service name as it will always override service.name + // If the service name is set to unknown service then override it from the config + if resource.get(opentelemetry_semantic_conventions::resource::SERVICE_NAME) + == Some(UNKNOWN_SERVICE.into()) + { + if let Some(service_name) = Resource::from_detectors( + Duration::from_secs(0), + vec![Box::new(ConfigResourceDetector( + metrics_common_config.clone(), + ))], + ) + .get(opentelemetry_semantic_conventions::resource::SERVICE_NAME) + { + resource = resource.merge(&mut Resource::new(vec![KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_NAME, + service_name, + )])); + } + } - fn with_apollo_metrics_collector(mut self, apollo_metrics: Sender) -> Self { - self.apollo_metrics = apollo_metrics; - self + Self { + resource: resource.clone(), + public_meter_provider_builder: opentelemetry::sdk::metrics::MeterProvider::builder() + .with_resource(resource.clone()), + apollo_meter_provider_builder: opentelemetry::sdk::metrics::MeterProvider::builder(), + prometheus_meter_provider: None, + custom_endpoints: MultiMap::new(), + apollo_metrics_sender: Sender::default(), + } } } @@ -544,24 +581,38 @@ pub(crate) trait MetricsConfigurator { ) -> Result; } -#[derive(Clone)] -pub(crate) struct BasicMetrics { - pub(crate) http_requests_total: Counter, - pub(crate) http_requests_duration: Histogram, +#[derive(Clone, Default, Debug)] +pub(crate) struct CustomAggregationSelector { + boundaries: Vec, + record_min_max: bool, +} + 
+#[buildstructor::buildstructor] +impl CustomAggregationSelector { + #[builder] + pub(crate) fn new( + boundaries: Vec, + record_min_max: Option, + ) -> CustomAggregationSelector { + Self { + boundaries, + record_min_max: record_min_max.unwrap_or(true), + } + } } -impl BasicMetrics { - pub(crate) fn new(meter_provider: &impl MeterProvider) -> BasicMetrics { - let meter = meter_provider.meter("apollo/router"); - BasicMetrics { - http_requests_total: meter - .u64_counter("apollo_router_http_requests_total") - .with_description("Total number of HTTP requests made.") - .init(), - http_requests_duration: meter - .f64_histogram("apollo_router_http_request_duration_seconds") - .with_description("Duration of HTTP requests.") - .init(), +impl AggregationSelector for CustomAggregationSelector { + fn aggregation(&self, kind: InstrumentKind) -> Aggregation { + match kind { + InstrumentKind::Counter + | InstrumentKind::UpDownCounter + | InstrumentKind::ObservableCounter + | InstrumentKind::ObservableUpDownCounter => Aggregation::Sum, + InstrumentKind::ObservableGauge => Aggregation::LastValue, + InstrumentKind::Histogram => Aggregation::ExplicitBucketHistogram { + boundaries: self.boundaries.clone(), + record_min_max: self.record_min_max, + }, } } } diff --git a/apollo-router/src/plugins/telemetry/metrics/otlp.rs b/apollo-router/src/plugins/telemetry/metrics/otlp.rs index cbbdb06514..d38d91ad0e 100644 --- a/apollo-router/src/plugins/telemetry/metrics/otlp.rs +++ b/apollo-router/src/plugins/telemetry/metrics/otlp.rs @@ -1,16 +1,14 @@ -use opentelemetry::sdk::export::metrics::aggregation; -use opentelemetry::sdk::metrics::selectors; -use opentelemetry::sdk::Resource; -use opentelemetry::KeyValue; +use opentelemetry::runtime; +use opentelemetry::sdk::metrics::PeriodicReader; use opentelemetry_otlp::HttpExporterBuilder; +use opentelemetry_otlp::MetricsExporterBuilder; use opentelemetry_otlp::TonicExporterBuilder; use tower::BoxError; use 
crate::plugins::telemetry::config::MetricsCommon; -use crate::plugins::telemetry::metrics::filter::FilterMeterProvider; +use crate::plugins::telemetry::metrics::CustomAggregationSelector; use crate::plugins::telemetry::metrics::MetricsBuilder; use crate::plugins::telemetry::metrics::MetricsConfigurator; -use crate::plugins::telemetry::otlp::Temporality; // TODO Remove MetricExporterBuilder once upstream issue is fixed // This has to exist because Http is not currently supported for metrics export @@ -43,41 +41,22 @@ impl MetricsConfigurator for super::super::otlp::Config { match exporter.exporter { Some(exporter) => { - let exporter = match self.temporality { - Temporality::Cumulative => opentelemetry_otlp::new_pipeline() - .metrics( - selectors::simple::histogram(metrics_config.buckets.clone()), - aggregation::stateless_temporality_selector(), - opentelemetry::runtime::Tokio, - ) - .with_exporter(exporter) - .with_resource(Resource::new( - metrics_config - .resources - .clone() - .into_iter() - .map(|(k, v)| KeyValue::new(k, v)), - )) - .build()?, - Temporality::Delta => opentelemetry_otlp::new_pipeline() - .metrics( - selectors::simple::histogram(metrics_config.buckets.clone()), - aggregation::delta_temporality_selector(), - opentelemetry::runtime::Tokio, - ) - .with_exporter(exporter) - .with_resource(Resource::new( - metrics_config - .resources - .clone() - .into_iter() - .map(|(k, v)| KeyValue::new(k, v)), - )) - .build()?, - }; - builder = builder - .with_meter_provider(FilterMeterProvider::public_metrics(exporter.clone())); - builder = builder.with_exporter(exporter); + let exporter = MetricsExporterBuilder::Tonic(exporter).build_metrics_exporter( + (&self.temporality).into(), + Box::new( + CustomAggregationSelector::builder() + .boundaries(metrics_config.buckets.clone()) + .build(), + ), + )?; + + builder.public_meter_provider_builder = + builder.public_meter_provider_builder.with_reader( + PeriodicReader::builder(exporter, runtime::Tokio) + 
.with_interval(self.batch_processor.scheduled_delay) + .with_timeout(self.batch_processor.max_export_timeout) + .build(), + ); Ok(builder) } None => Err("otlp metric export does not support http yet".into()), diff --git a/apollo-router/src/plugins/telemetry/metrics/prometheus.rs b/apollo-router/src/plugins/telemetry/metrics/prometheus.rs index d5c8a4b2c4..591165e8fe 100644 --- a/apollo-router/src/plugins/telemetry/metrics/prometheus.rs +++ b/apollo-router/src/plugins/telemetry/metrics/prometheus.rs @@ -5,13 +5,9 @@ use std::task::Poll; use futures::future::BoxFuture; use http::StatusCode; use once_cell::sync::Lazy; -use opentelemetry::sdk::export::metrics::aggregation; -use opentelemetry::sdk::metrics::controllers; -use opentelemetry::sdk::metrics::controllers::BasicController; -use opentelemetry::sdk::metrics::processors; -use opentelemetry::sdk::metrics::selectors; +use opentelemetry::sdk::metrics::MeterProvider; +use opentelemetry::sdk::metrics::MeterProviderBuilder; use opentelemetry::sdk::Resource; -use opentelemetry::KeyValue; use prometheus::Encoder; use prometheus::Registry; use prometheus::TextEncoder; @@ -22,7 +18,7 @@ use tower::ServiceExt; use tower_service::Service; use crate::plugins::telemetry::config::MetricsCommon; -use crate::plugins::telemetry::metrics::filter::FilterMeterProvider; +use crate::plugins::telemetry::metrics::CustomAggregationSelector; use crate::plugins::telemetry::metrics::MetricsBuilder; use crate::plugins::telemetry::metrics::MetricsConfigurator; use crate::router_factory::Endpoint; @@ -62,18 +58,26 @@ impl Default for Config { } // Prometheus metrics are special. We want them to persist between restarts if possible. -// This means reusing the existing controller if we can. -// These statics will keep track of new controllers for commit when the telemetry plugin is activated. 
-static CONTROLLER: Lazy>> = Lazy::new(Default::default); -static NEW_CONTROLLER: Lazy>> = Lazy::new(Default::default); - -pub(crate) fn commit_new_controller() { - if let Some(controller) = NEW_CONTROLLER.lock().expect("lock poisoned").take() { - tracing::debug!("committing prometheus controller"); - CONTROLLER +// This means reusing the existing registry and meter provider if we can. +// These statics will keep track of new registry for commit when the telemetry plugin is activated. +static EXISTING_PROMETHEUS: Lazy>> = + Lazy::new(Default::default); +static NEW_PROMETHEUS: Lazy>> = + Lazy::new(Default::default); + +#[derive(PartialEq, Clone)] +struct PrometheusConfig { + resource: Resource, + buckets: Vec, +} + +pub(crate) fn commit_prometheus() { + if let Some(prometheus) = NEW_PROMETHEUS.lock().expect("lock poisoned").take() { + tracing::debug!("committing prometheus registry"); + EXISTING_PROMETHEUS .lock() .expect("lock poisoned") - .replace(controller); + .replace(prometheus); } } @@ -83,56 +87,81 @@ impl MetricsConfigurator for Config { mut builder: MetricsBuilder, metrics_config: &MetricsCommon, ) -> Result { + // Prometheus metrics are special, they must persist between reloads. This means that we only want to create something new if the resources have changed. + // The prometheus exporter, and the associated registry are linked, so replacing one means replacing the other. + + let prometheus_config = PrometheusConfig { + resource: builder.resource.clone(), + buckets: metrics_config.buckets.clone(), + }; + if self.enabled { - let mut controller = controllers::basic(processors::factory( - selectors::simple::histogram(metrics_config.buckets.clone()), - aggregation::stateless_temporality_selector(), - )) - .with_resource(Resource::new( - metrics_config - .resources - .clone() - .into_iter() - .map(|(k, v)| KeyValue::new(k, v)), - )) - .build(); - - // Check the last controller to see if the resources are the same, if they are we can use it as is. 
+ // Check the last registry to see if the resources are the same, if they are we can use it as is. // Otherwise go with the new controller and store it so that it can be committed during telemetry activation. - if let Some(last_controller) = CONTROLLER.lock().expect("lock poisoned").clone() { - if controller.resource() == last_controller.resource() { - tracing::debug!("prometheus controller can be reused"); - controller = last_controller + // Note that during tests the prom registry cannot be reused as we have a different meter provider for each test. + // Prom reloading IS tested in an integration test. + #[cfg(not(test))] + if let Some((last_config, last_registry)) = + EXISTING_PROMETHEUS.lock().expect("lock poisoned").clone() + { + if prometheus_config == last_config { + tracing::debug!("prometheus registry can be reused"); + builder.custom_endpoints.insert( + self.listen.clone(), + Endpoint::from_router_service( + self.path.clone(), + PrometheusService { + registry: last_registry.clone(), + } + .boxed(), + ), + ); + return Ok(builder); } else { - tracing::debug!("prometheus controller cannot be reused"); + tracing::debug!("prometheus registry cannot be reused"); } } - NEW_CONTROLLER - .lock() - .expect("lock poisoned") - .replace(controller.clone()); - let exporter = opentelemetry_prometheus::exporter(controller).try_init()?; - - builder = builder.with_custom_endpoint( + let registry = prometheus::Registry::new(); + + let exporter = opentelemetry_prometheus::exporter() + .with_aggregation_selector( + CustomAggregationSelector::builder() + .boundaries(metrics_config.buckets.clone()) + .record_min_max(true) + .build(), + ) + .with_registry(registry.clone()) + .build()?; + + let meter_provider = MeterProvider::builder() + .with_reader(exporter) + .with_resource(builder.resource.clone()) + .build(); + builder.custom_endpoints.insert( self.listen.clone(), Endpoint::from_router_service( self.path.clone(), PrometheusService { - registry: exporter.registry().clone(), 
+ registry: registry.clone(), } .boxed(), ), ); - builder = builder.with_meter_provider(FilterMeterProvider::public_metrics( - exporter.meter_provider()?, - )); - builder = builder.with_exporter(exporter); + builder.prometheus_meter_provider = Some(meter_provider.clone()); + + NEW_PROMETHEUS + .lock() + .expect("lock poisoned") + .replace((prometheus_config, registry)); + tracing::info!( "Prometheus endpoint exposed at {}{}", self.listen, self.path ); + } else { + builder.prometheus_meter_provider = Some(MeterProviderBuilder::default().build()); } Ok(builder) } diff --git a/apollo-router/src/plugins/telemetry/mod.rs b/apollo-router/src/plugins/telemetry/mod.rs index 976c7a6129..4c76dcf5b7 100644 --- a/apollo-router/src/plugins/telemetry/mod.rs +++ b/apollo-router/src/plugins/telemetry/mod.rs @@ -28,7 +28,6 @@ use opentelemetry::propagation::text_map_propagator::FieldIter; use opentelemetry::propagation::Extractor; use opentelemetry::propagation::Injector; use opentelemetry::propagation::TextMapPropagator; -use opentelemetry::sdk::metrics::controllers::BasicController; use opentelemetry::sdk::propagation::TextMapCompositePropagator; use opentelemetry::sdk::trace::Builder; use opentelemetry::trace::SpanContext; @@ -37,7 +36,6 @@ use opentelemetry::trace::TraceContextExt; use opentelemetry::trace::TraceFlags; use opentelemetry::trace::TraceState; use opentelemetry::trace::TracerProvider; -use opentelemetry::Context as OtelContext; use opentelemetry::KeyValue; use parking_lot::Mutex; use rand::Rng; @@ -68,11 +66,9 @@ use self::metrics::apollo::studio::SingleTypeStat; use self::metrics::AttributesForwardConf; use self::metrics::MetricsAttributesConf; use self::reload::reload_fmt; -use self::reload::reload_metrics; use self::reload::LayeredTracer; use self::reload::NullFieldFormatter; use self::reload::SamplingFilter; -use self::reload::OPENTELEMETRY_TRACER_HANDLE; use self::tracing::apollo_telemetry::APOLLO_PRIVATE_DURATION_NS; use 
super::traffic_shaping::cache::hash_request; use super::traffic_shaping::cache::hash_vary_headers; @@ -80,6 +76,9 @@ use super::traffic_shaping::cache::REPRESENTATIONS; use crate::axum_factory::utils::REQUEST_SPAN_NAME; use crate::context::OPERATION_NAME; use crate::layers::ServiceBuilderExt; +use crate::metrics::aggregation::MeterProviderType; +use crate::metrics::filter::FilterMeterProvider; +use crate::metrics::meter_provider; use crate::plugin::Plugin; use crate::plugin::PluginInit; use crate::plugins::telemetry::apollo::ForwardHeaders; @@ -92,17 +91,16 @@ use crate::plugins::telemetry::config::Trace; use crate::plugins::telemetry::config::Tracing; use crate::plugins::telemetry::formatters::filter_metric_events; use crate::plugins::telemetry::formatters::FilteringFormatter; -use crate::plugins::telemetry::metrics::aggregation::AggregateMeterProvider; use crate::plugins::telemetry::metrics::apollo::studio::SingleContextualizedStats; use crate::plugins::telemetry::metrics::apollo::studio::SinglePathErrorStats; use crate::plugins::telemetry::metrics::apollo::studio::SingleQueryLatencyStats; use crate::plugins::telemetry::metrics::apollo::studio::SingleStats; use crate::plugins::telemetry::metrics::apollo::studio::SingleStatsReport; -use crate::plugins::telemetry::metrics::layer::MetricsLayer; -use crate::plugins::telemetry::metrics::BasicMetrics; +use crate::plugins::telemetry::metrics::prometheus::commit_prometheus; use crate::plugins::telemetry::metrics::MetricsBuilder; use crate::plugins::telemetry::metrics::MetricsConfigurator; -use crate::plugins::telemetry::metrics::MetricsExporterHandle; +use crate::plugins::telemetry::reload::metrics_layer; +use crate::plugins::telemetry::reload::OPENTELEMETRY_TRACER_HANDLE; use crate::plugins::telemetry::tracing::apollo_telemetry::decode_ftv1_trace; use crate::plugins::telemetry::tracing::apollo_telemetry::APOLLO_PRIVATE_OPERATION_SIGNATURE; use crate::plugins::telemetry::tracing::TracingConfigurator; @@ -148,7 +146,6 @@ 
pub(crate) const OPERATION_KIND: &str = "apollo_telemetry::operation_kind"; pub(crate) const STUDIO_EXCLUDE: &str = "apollo_telemetry::studio::exclude"; pub(crate) const LOGGING_DISPLAY_HEADERS: &str = "apollo_telemetry::logging::display_headers"; pub(crate) const LOGGING_DISPLAY_BODY: &str = "apollo_telemetry::logging::display_body"; -const DEFAULT_SERVICE_NAME: &str = "apollo-router"; const GLOBAL_TRACER_NAME: &str = "apollo-router"; const DEFAULT_EXPOSE_TRACE_ID_HEADER: &str = "apollo-trace-id"; static DEFAULT_EXPOSE_TRACE_ID_HEADER_NAME: HeaderName = @@ -159,18 +156,16 @@ static FTV1_HEADER_VALUE: HeaderValue = HeaderValue::from_static("ftv1"); #[doc(hidden)] // Only public for integration tests pub(crate) struct Telemetry { config: Arc, - metrics: BasicMetrics, - // Do not remove metrics_exporters. Metrics will not be exported if it is removed. - // Typically the handles are a PushController but may be something else. Dropping the handle will - // shutdown exporter. - metrics_exporters: Vec, custom_endpoints: MultiMap, apollo_metrics_sender: apollo_exporter::Sender, field_level_instrumentation_ratio: f64, sampling_filter_ratio: SamplerOption, tracer_provider: Option, - meter_provider: AggregateMeterProvider, + // We have to have separate meter providers for prometheus metrics so that they don't get zapped on router reload. + public_meter_provider: Option, + public_prometheus_meter_provider: Option, + private_meter_provider: Option, counter: Option>>, } @@ -209,36 +204,10 @@ fn setup_metrics_exporter( impl Drop for Telemetry { fn drop(&mut self) { - // If we can downcast the metrics exporter to be a `BasicController`, then we - // should stop it to ensure metrics are transmitted before the exporter is dropped. - for exporter in self.metrics_exporters.drain(..) 
{ - if let Ok(controller) = MetricsExporterHandle::downcast::(exporter) { - ::tracing::debug!("stopping basic controller: {controller:?}"); - let cx = OtelContext::current(); - - thread::spawn(move || { - if let Err(e) = controller.stop(&cx) { - ::tracing::error!("error during basic controller stop: {e}"); - } - ::tracing::debug!("stopped basic controller: {controller:?}"); - }); - } - } - // If for some reason we didn't use the trace provider then safely discard it e.g. some other plugin failed `new` - // To ensure we don't hang tracing providers are dropped in a blocking task. - // https://github.com/open-telemetry/opentelemetry-rust/issues/868#issuecomment-1250387989 - // We don't have to worry about timeouts as every exporter is batched, which has a timeout on it already. - if let Some(tracer_provider) = self.tracer_provider.take() { - // If we have no runtime then we don't need to spawn a task as we are already in a blocking context. - if Handle::try_current().is_ok() { - // This is a thread for a reason! - // Tokio doesn't finish executing tasks before termination https://github.com/tokio-rs/tokio/issues/1156. - // This means that if the runtime is shutdown there is potentially a race where the provider may not be flushed. - // By using a thread it doesn't matter if the tokio runtime is shut down. - // This is likely to happen in tests due to the tokio runtime being destroyed when the test method exits. 
- thread::spawn(move || drop(tracer_provider)); - } - } + Self::safe_shutdown_meter_provider(&mut self.private_meter_provider); + Self::safe_shutdown_meter_provider(&mut self.public_meter_provider); + Self::safe_shutdown_meter_provider(&mut self.public_prometheus_meter_provider); + self.safe_shutown_tracer(); } } @@ -247,13 +216,16 @@ impl Plugin for Telemetry { type Config = config::Conf; async fn new(init: PluginInit) -> Result { + opentelemetry::global::set_error_handler(handle_error) + .expect("otel error handler lock poisoned, fatal"); + let config = init.config; config.logging.validate()?; let field_level_instrumentation_ratio = config.calculate_field_level_instrumentation_ratio()?; - let mut metrics_builder = Self::create_metrics_builder(&config)?; - let meter_provider = metrics_builder.meter_provider(); + let metrics_builder = Self::create_metrics_builder(&config)?; + let counter = config .metrics .as_ref() @@ -270,13 +242,17 @@ impl Plugin for Telemetry { let (sampling_filter_ratio, tracer_provider) = Self::create_tracer_provider(&config)?; Ok(Telemetry { - custom_endpoints: metrics_builder.custom_endpoints(), - metrics_exporters: metrics_builder.exporters(), - metrics: BasicMetrics::new(&meter_provider), - apollo_metrics_sender: metrics_builder.apollo_metrics_provider(), + custom_endpoints: metrics_builder.custom_endpoints, + apollo_metrics_sender: metrics_builder.apollo_metrics_sender, field_level_instrumentation_ratio, tracer_provider: Some(tracer_provider), - meter_provider, + public_meter_provider: Some(metrics_builder.public_meter_provider_builder.build()) + .map(FilterMeterProvider::public_metrics), + private_meter_provider: Some(metrics_builder.apollo_meter_provider_builder.build()) + .map(FilterMeterProvider::private_metrics), + public_prometheus_meter_provider: metrics_builder + .prometheus_meter_provider + .map(FilterMeterProvider::public_metrics), sampling_filter_ratio, config: Arc::new(config), counter, @@ -388,7 +364,6 @@ impl Plugin for 
Telemetry { fn supergraph_service(&self, service: supergraph::BoxService) -> supergraph::BoxService { let metrics_sender = self.apollo_metrics_sender.clone(); - let metrics = self.metrics.clone(); let config = self.config.clone(); let config_map_res_first = config.clone(); let config_map_res = config.clone(); @@ -443,7 +418,6 @@ impl Plugin for Telemetry { }, move |ctx: Context, fut| { let config = config_map_res.clone(); - let metrics = metrics.clone(); let sender = metrics_sender.clone(); let start = Instant::now(); @@ -452,13 +426,12 @@ impl Plugin for Telemetry { result = Self::update_otel_metrics( config.clone(), ctx.clone(), - metrics.clone(), result, start.elapsed(), ) .await; Self::update_metrics_on_response_events( - &ctx, config, field_level_instrumentation_ratio, metrics, sender, start, result, + &ctx, config, field_level_instrumentation_ratio, sender, start, result, ) } }, @@ -497,7 +470,6 @@ impl Plugin for Telemetry { } fn subgraph_service(&self, name: &str, service: subgraph::BoxService) -> subgraph::BoxService { - let metrics = self.metrics.clone(); let subgraph_attribute = KeyValue::new("subgraph", name.to_string()); let subgraph_metrics_conf_req = self.create_subgraph_metrics_conf(name); let subgraph_metrics_conf_resp = subgraph_metrics_conf_req.clone(); @@ -553,7 +525,6 @@ impl Plugin for Telemetry { }, move |(context, cache_attributes): (Context, Option), f: BoxFuture<'static, Result>| { - let metrics = metrics.clone(); let subgraph_attribute = subgraph_attribute.clone(); let subgraph_metrics_conf = subgraph_metrics_conf_resp.clone(); let counter = counter.clone(); @@ -562,7 +533,6 @@ impl Plugin for Telemetry { f.map(move |result: Result| { Self::store_subgraph_response_attributes( &context, - metrics, subgraph_attribute, subgraph_metrics_conf, now, @@ -602,6 +572,7 @@ impl Telemetry { let tracer = tracer_provider.versioned_tracer( GLOBAL_TRACER_NAME, Some(env!("CARGO_PKG_VERSION")), + None::, None, ); hot_tracer.reload(tracer); @@ -611,13 
+582,12 @@ impl Telemetry { // https://github.com/open-telemetry/opentelemetry-rust/issues/868#issuecomment-1250387989 // We don't have to worry about timeouts as every exporter is batched, which has a timeout on it already. tokio::task::spawn_blocking(move || drop(last_provider)); - opentelemetry::global::set_error_handler(handle_error) - .expect("otel error handler lock poisoned, fatal"); opentelemetry::global::set_text_map_propagator(Self::create_propagator(&self.config)); } - reload_metrics(MetricsLayer::new(&self.meter_provider)); + self.reload_metrics(); + reload_fmt(Self::create_fmt_layer(&self.config)); } @@ -696,37 +666,12 @@ impl Telemetry { fn create_metrics_builder(config: &config::Conf) -> Result { let metrics_config = config.metrics.clone().unwrap_or_default(); - let metrics_common_config = &mut metrics_config.common.unwrap_or_default(); - // Set default service name for metrics - if metrics_common_config - .resources - .get(opentelemetry_semantic_conventions::resource::SERVICE_NAME.as_str()) - .is_none() - { - metrics_common_config.resources.insert( - String::from(opentelemetry_semantic_conventions::resource::SERVICE_NAME.as_str()), - String::from( - metrics_common_config - .service_name - .as_deref() - .unwrap_or(DEFAULT_SERVICE_NAME), - ), - ); - } - if let Some(service_namespace) = &metrics_common_config.service_namespace { - metrics_common_config.resources.insert( - String::from( - opentelemetry_semantic_conventions::resource::SERVICE_NAMESPACE.as_str(), - ), - service_namespace.clone(), - ); - } - - let mut builder = MetricsBuilder::default(); - builder = setup_metrics_exporter(builder, &config.apollo, metrics_common_config)?; + let metrics_common_config = metrics_config.common.unwrap_or_default().clone(); + let mut builder = MetricsBuilder::new(config); + builder = setup_metrics_exporter(builder, &config.apollo, &metrics_common_config)?; builder = - setup_metrics_exporter(builder, &metrics_config.prometheus, metrics_common_config)?; - builder = 
setup_metrics_exporter(builder, &metrics_config.otlp, metrics_common_config)?; + setup_metrics_exporter(builder, &metrics_config.prometheus, &metrics_common_config)?; + builder = setup_metrics_exporter(builder, &metrics_config.otlp, &metrics_common_config)?; Ok(builder) } @@ -838,7 +783,6 @@ impl Telemetry { async fn update_otel_metrics( config: Arc, context: Context, - metrics: BasicMetrics, result: Result, request_duration: Duration, ) -> Result { @@ -889,9 +833,11 @@ impl Telemetry { if !parts.status.is_success() { metric_attrs.push(KeyValue::new("error", parts.status.to_string())); } - ::tracing::info!( - monotonic_counter.apollo.router.operations = 1u64, - http.response.status_code = parts.status.as_u16() as i64, + u64_counter!( + "apollo.router.operations", + "The number of graphql operations performed by the Router", + 1, + "http.response.status_code" = parts.status.as_u16() as i64 ); let response = http::Response::from_parts( parts, @@ -904,26 +850,30 @@ impl Telemetry { } Err(err) => { metric_attrs.push(KeyValue::new("status", "500")); - - ::tracing::info!( - monotonic_counter.apollo.router.operations = 1u64, - http.response.status_code = 500i64, + u64_counter!( + "apollo.router.operations", + "The number of graphql operations performed by the Router", + 1, + "http.response.status_code" = 500 ); Err(err) } }; // http_requests_total - the total number of HTTP requests received - metrics - .http_requests_total - .add(&opentelemetry::Context::current(), 1, &metric_attrs); + u64_counter!( + "apollo_router_http_requests_total", + "Total number of HTTP requests made.", + 1, + metric_attrs + ); - metrics.http_requests_duration.record( - &opentelemetry::Context::current(), + f64_histogram!( + "apollo_router_http_request_duration_seconds", + "Duration of HTTP requests.", request_duration.as_secs_f64(), - &metric_attrs, + metric_attrs ); - res } @@ -1146,7 +1096,6 @@ impl Telemetry { #[allow(clippy::too_many_arguments)] fn store_subgraph_response_attributes( 
context: &Context, - metrics: BasicMetrics, subgraph_attribute: KeyValue, attribute_forward_config: Arc>, now: Instant, @@ -1215,10 +1164,11 @@ impl Telemetry { ); } - metrics.http_requests_total.add( - &opentelemetry::Context::current(), + u64_counter!( + "apollo_router_http_requests_total", + "Total number of HTTP requests made.", 1, - &metric_attrs, + metric_attrs ); } Err(err) => { @@ -1233,17 +1183,19 @@ impl Telemetry { ); } - metrics.http_requests_total.add( - &opentelemetry::Context::current(), + u64_counter!( + "apollo_router_http_requests_total", + "Total number of HTTP requests made.", 1, - &metric_attrs, + metric_attrs ); } } - metrics.http_requests_duration.record( - &opentelemetry::Context::current(), + f64_histogram!( + "apollo_router_http_request_duration_seconds", + "Duration of HTTP requests.", now.elapsed().as_secs_f64(), - &metric_attrs, + metric_attrs ); } @@ -1252,7 +1204,6 @@ impl Telemetry { ctx: &Context, config: Arc, field_level_instrumentation_ratio: f64, - metrics: BasicMetrics, sender: Sender, start: Instant, result: Result, @@ -1292,10 +1243,11 @@ impl Telemetry { ); } - metrics.http_requests_total.add( - &opentelemetry::Context::current(), + u64_counter!( + "apollo_router_http_requests_total", + "Total number of HTTP requests made.", 1, - &metric_attrs, + metric_attrs ); Err(e) @@ -1661,6 +1613,77 @@ impl Telemetry { ); } } + fn reload_metrics(&mut self) { + let meter_provider = meter_provider(); + if self.public_prometheus_meter_provider.is_some() { + commit_prometheus(); + } + let mut old_meter_providers = Vec::new(); + if let Some(old_provider) = meter_provider.set( + MeterProviderType::PublicPrometheus, + self.public_prometheus_meter_provider.take(), + ) { + old_meter_providers.push((MeterProviderType::PublicPrometheus, old_provider)); + } + + if let Some(old_provider) = meter_provider.set( + MeterProviderType::Apollo, + self.private_meter_provider.take(), + ) { + old_meter_providers.push((MeterProviderType::Apollo, 
old_provider)); + } + if let Some(old_provider) = + meter_provider.set(MeterProviderType::Public, self.public_meter_provider.take()) + { + old_meter_providers.push((MeterProviderType::Public, old_provider)); + } + + metrics_layer().clear(); + + // Old providers MUST be shut down in a blocking thread. + tokio::task::spawn_blocking(move || { + for (meter_provider_type, meter_provider) in old_meter_providers { + if let Err(e) = meter_provider.shutdown() { + ::tracing::error!(error = %e, meter_provider_type = ?meter_provider_type, "failed to shutdown meter provider") + } + } + }); + } + + fn safe_shutdown_meter_provider(meter_provider: &mut Option) { + if Handle::try_current().is_ok() { + if let Some(meter_provider) = meter_provider.take() { + // This is a thread for a reason! + // Tokio doesn't finish executing tasks before termination https://github.com/tokio-rs/tokio/issues/1156. + // This means that if the runtime is shutdown there is potentially a race where the provider may not be flushed. + // By using a thread it doesn't matter if the tokio runtime is shut down. + // This is likely to happen in tests due to the tokio runtime being destroyed when the test method exits. + thread::spawn(move || { + if let Err(e) = meter_provider.shutdown() { + ::tracing::error!(error = %e, "failed to shutdown meter provider") + } + }); + } + } + } + + fn safe_shutown_tracer(&mut self) { + // If for some reason we didn't use the trace provider then safely discard it e.g. some other plugin failed `new` + // To ensure we don't hang tracing providers are dropped in a blocking task. + // https://github.com/open-telemetry/opentelemetry-rust/issues/868#issuecomment-1250387989 + // We don't have to worry about timeouts as every exporter is batched, which has a timeout on it already. + if let Some(tracer_provider) = self.tracer_provider.take() { + // If we have no runtime then we don't need to spawn a task as we are already in a blocking context. 
+ if Handle::try_current().is_ok() { + // This is a thread for a reason! + // Tokio doesn't finish executing tasks before termination https://github.com/tokio-rs/tokio/issues/1156. + // This means that if the runtime is shutdown there is potentially a race where the provider may not be flushed. + // By using a thread it doesn't matter if the tokio runtime is shut down. + // This is likely to happen in tests due to the tokio runtime being destroyed when the test method exits. + thread::spawn(move || drop(tracer_provider)); + } + } + } } #[derive(Debug, Clone)] @@ -1836,6 +1859,14 @@ fn handle_error>(err: T) { // We have to rate limit these errors because when they happen they are very frequent. // Use a dashmap to store the message type with the last time it was logged. let last_logged_map = OTEL_ERROR_LAST_LOGGED.get_or_init(DashMap::new); + + handle_error_internal(err, last_logged_map); +} + +fn handle_error_internal>( + err: T, + last_logged_map: &DashMap, +) { let err = err.into(); // We don't want the dashmap to get big, so we key the error messages by type. 
@@ -1866,7 +1897,9 @@ fn handle_error>(err: T) { ::tracing::error!("OpenTelemetry trace error occurred: {}", err) } opentelemetry::global::Error::Metric(err) => { - ::tracing::error!("OpenTelemetry metric error occurred: {}", err) + if err.to_string() != "Metrics error: reader is shut down or not registered" { + ::tracing::error!("OpenTelemetry metric error occurred: {}", err) + } } opentelemetry::global::Error::Other(err) => { ::tracing::error!("OpenTelemetry error occurred: {}", err) @@ -2003,12 +2036,12 @@ struct EnableSubgraphFtv1; mod tests { use std::fmt::Debug; use std::ops::DerefMut; - use std::str::FromStr; use std::sync::Arc; use std::sync::Mutex; use std::time::Duration; use axum::headers::HeaderName; + use dashmap::DashMap; use http::HeaderMap; use http::HeaderValue; use http::StatusCode; @@ -2030,431 +2063,49 @@ mod tests { use tracing_subscriber::Layer; use super::apollo::ForwardHeaders; + use super::Telemetry; use crate::error::FetchError; use crate::graphql::Error; use crate::graphql::Request; use crate::http_ext; use crate::json_ext::Object; + use crate::metrics::FutureMetricsExt; use crate::plugin::test::MockSubgraphService; use crate::plugin::test::MockSupergraphService; use crate::plugin::DynPlugin; - use crate::plugins::telemetry::handle_error; + use crate::plugins::telemetry::handle_error_internal; use crate::services::SubgraphRequest; use crate::services::SubgraphResponse; use crate::services::SupergraphRequest; use crate::services::SupergraphResponse; - #[tokio::test(flavor = "multi_thread")] - async fn plugin_registered() { - crate::plugin::plugins() + async fn create_plugin_with_config(config: &str) -> Box { + let prometheus_support = config.contains("prometheus"); + let config: Value = serde_yaml::from_str(config).expect("yaml must be valid"); + let telemetry_config = config + .as_object() + .expect("must be an object") + .get("telemetry") + .expect("root key must be telemetry"); + let mut plugin = crate::plugin::plugins() .find(|factory| 
factory.name == "apollo.telemetry") .expect("Plugin not found") - .create_instance( - &serde_json::json!({"apollo": {"schema_id":"abc"}, "tracing": {}}), - Default::default(), - Default::default(), - ) + .create_instance(telemetry_config, Default::default(), Default::default()) .await .unwrap(); - } - #[tokio::test(flavor = "multi_thread")] - async fn attribute_serialization() { - crate::plugin::plugins() - .find(|factory| factory.name == "apollo.telemetry") - .expect("Plugin not found") - .create_instance( - &serde_json::json!({ - "apollo": {"schema_id":"abc"}, - "tracing": { - "trace_config": { - "service_name": "router", - "attributes": { - "str": "a", - "int": 1, - "float": 1.0, - "bool": true, - "str_arr": ["a", "b"], - "int_arr": [1, 2], - "float_arr": [1.0, 2.0], - "bool_arr": [true, false] - } - } - }, - "metrics": { - "common": { - "attributes": { - "supergraph": { - "static": [ - { - "name": "myname", - "value": "label_value" - } - ], - "request": { - "header": [{ - "named": "test", - "default": "default_value", - "rename": "renamed_value" - }], - "body": [{ - "path": ".data.test", - "name": "my_new_name", - "default": "default_value" - }] - }, - "response": { - "header": [{ - "named": "test", - "default": "default_value", - "rename": "renamed_value", - }, { - "named": "test", - "default": "default_value", - "rename": "renamed_value", - }], - "body": [{ - "path": ".data.test", - "name": "my_new_name", - "default": "default_value" - }] - } - }, - "subgraph": { - "all": { - "static": [ - { - "name": "myname", - "value": "label_value" - } - ], - "request": { - "header": [{ - "named": "test", - "default": "default_value", - "rename": "renamed_value", - }], - "body": [{ - "path": ".data.test", - "name": "my_new_name", - "default": "default_value" - }] - }, - "response": { - "header": [{ - "named": "test", - "default": "default_value", - "rename": "renamed_value", - }, { - "named": "test", - "default": "default_value", - "rename": "renamed_value", - }], - 
"body": [{ - "path": ".data.test", - "name": "my_new_name", - "default": "default_value" - }] - } - }, - "subgraphs": { - "subgraph_name_test": { - "static": [ - { - "name": "myname", - "value": "label_value" - } - ], - "request": { - "header": [{ - "named": "test", - "default": "default_value", - "rename": "renamed_value", - }], - "body": [{ - "path": ".data.test", - "name": "my_new_name", - "default": "default_value" - }] - }, - "response": { - "header": [{ - "named": "test", - "default": "default_value", - "rename": "renamed_value", - }, { - "named": "test", - "default": "default_value", - "rename": "renamed_value", - }], - "body": [{ - "path": ".data.test", - "name": "my_new_name", - "default": "default_value" - }] - } - } - } - } - } - } - } - }), - Default::default(), - Default::default(), - ) - .await - .unwrap(); + if prometheus_support { + plugin + .as_any_mut() + .downcast_mut::() + .unwrap() + .reload_metrics(); + } + plugin } - #[tokio::test(flavor = "multi_thread")] - async fn it_test_prometheus_metrics() { - let mut mock_service = MockSupergraphService::new(); - mock_service - .expect_call() - .times(1) - .returning(move |req: SupergraphRequest| { - Ok(SupergraphResponse::fake_builder() - .context(req.context) - .header("x-custom", "coming_from_header") - .data(json!({"data": {"my_value": 2usize}})) - .build() - .unwrap()) - }); - - let mut mock_bad_request_service = MockSupergraphService::new(); - mock_bad_request_service - .expect_call() - .times(1) - .returning(move |req: SupergraphRequest| { - Ok(SupergraphResponse::fake_builder() - .context(req.context) - .status_code(StatusCode::BAD_REQUEST) - .data(json!({"errors": [{"message": "nope"}]})) - .build() - .unwrap()) - }); - - let mut mock_subgraph_service = MockSubgraphService::new(); - mock_subgraph_service - .expect_call() - .times(1) - .returning(move |req: SubgraphRequest| { - let mut extension = Object::new(); - extension.insert( - serde_json_bytes::ByteString::from("status"), - 
serde_json_bytes::Value::String(ByteString::from("INTERNAL_SERVER_ERROR")), - ); - let _ = req - .context - .insert("my_key", "my_custom_attribute_from_context".to_string()) - .unwrap(); - Ok(SubgraphResponse::fake_builder() - .context(req.context) - .error( - Error::builder() - .message(String::from("an error occured")) - .extensions(extension) - .extension_code("FETCH_ERROR") - .build(), - ) - .build()) - }); - - let mut mock_subgraph_service_in_error = MockSubgraphService::new(); - mock_subgraph_service_in_error - .expect_call() - .times(1) - .returning(move |_req: SubgraphRequest| { - Err(Box::new(FetchError::SubrequestHttpError { - status_code: None, - service: String::from("my_subgraph_name_error"), - reason: String::from("cannot contact the subgraph"), - })) - }); - - let dyn_plugin: Box = crate::plugin::plugins() - .find(|factory| factory.name == "apollo.telemetry") - .expect("Plugin not found") - .create_instance( - &Value::from_str( - r#"{ - "apollo": { - "client_name_header": "name_header", - "client_version_header": "version_header", - "schema_id": "schema_sha" - }, - "metrics": { - "common": { - "service_name": "apollo-router", - "attributes": { - "supergraph": { - "static": [ - { - "name": "myname", - "value": "label_value" - } - ], - "request": { - "header": [ - { - "named": "test", - "default": "default_value", - "rename": "renamed_value" - }, - { - "named": "another_test", - "default": "my_default_value" - } - ] - }, - "response": { - "header": [{ - "named": "x-custom" - }], - "body": [{ - "path": ".data.data.my_value", - "name": "my_value" - }] - } - }, - "subgraph": { - "all": { - "errors": { - "include_messages": true, - "extensions": [{ - "name": "subgraph_error_extended_code", - "path": ".code" - }, { - "name": "message", - "path": ".reason" - }] - } - }, - "subgraphs": { - "my_subgraph_name": { - "request": { - "body": [{ - "path": ".query", - "name": "query_from_request" - }, { - "path": ".data", - "name": "unknown_data", - "default": 
"default_value" - }, { - "path": ".data2", - "name": "unknown_data_bis" - }] - }, - "response": { - "body": [{ - "path": ".errors[0].extensions.status", - "name": "error" - }] - }, - "context": [ - { - "named": "my_key" - } - ] - } - } - } - } - }, - "prometheus": { - "enabled": true - } - } - }"#, - ) - .unwrap(), - Default::default(), - Default::default(), - ) - .await - .unwrap(); - let mut supergraph_service = dyn_plugin.supergraph_service(BoxService::new(mock_service)); - let router_req = SupergraphRequest::fake_builder().header("test", "my_value_set"); - - let _router_response = supergraph_service - .ready() - .await - .unwrap() - .call(router_req.build().unwrap()) - .await - .unwrap() - .next_response() - .await - .unwrap(); - - let mut bad_request_supergraph_service = - dyn_plugin.supergraph_service(BoxService::new(mock_bad_request_service)); - let router_req = SupergraphRequest::fake_builder().header("test", "my_value_set"); - - let _router_response = bad_request_supergraph_service - .ready() - .await - .unwrap() - .call(router_req.build().unwrap()) - .await - .unwrap() - .next_response() - .await - .unwrap(); - - let mut subgraph_service = - dyn_plugin.subgraph_service("my_subgraph_name", BoxService::new(mock_subgraph_service)); - let subgraph_req = SubgraphRequest::fake_builder() - .subgraph_request( - http_ext::Request::fake_builder() - .header("test", "my_value_set") - .body( - Request::fake_builder() - .query(String::from("query { test }")) - .build(), - ) - .build() - .unwrap(), - ) - .build(); - let _subgraph_response = subgraph_service - .ready() - .await - .unwrap() - .call(subgraph_req) - .await - .unwrap(); - // Another subgraph - let mut subgraph_service = dyn_plugin.subgraph_service( - "my_subgraph_name_error", - BoxService::new(mock_subgraph_service_in_error), - ); - let subgraph_req = SubgraphRequest::fake_builder() - .subgraph_request( - http_ext::Request::fake_builder() - .header("test", "my_value_set") - .body( - Request::fake_builder() - 
.query(String::from("query { test }")) - .build(), - ) - .build() - .unwrap(), - ) - .build(); - let _subgraph_response = subgraph_service - .ready() - .await - .unwrap() - .call(subgraph_req) - .await - .expect_err("Must be in error"); - - let http_req_prom = http::Request::get("http://localhost:9090/WRONG/URL/metrics") - .body(Default::default()) - .unwrap(); - let mut web_endpoint = dyn_plugin + async fn get_prometheus_metrics(plugin: &dyn DynPlugin) -> String { + let web_endpoint = plugin .web_endpoints() .into_iter() .next() @@ -2464,14 +2115,6 @@ mod tests { .next() .unwrap() .into_router(); - let resp = web_endpoint - .ready() - .await - .unwrap() - .call(http_req_prom) - .await - .unwrap(); - assert_eq!(resp.status(), StatusCode::NOT_FOUND); let http_req_prom = http::Request::get("http://localhost:9090/metrics") .body(Default::default()) @@ -2479,17 +2122,15 @@ mod tests { let mut resp = web_endpoint.oneshot(http_req_prom).await.unwrap(); assert_eq!(resp.status(), StatusCode::OK); let body = hyper::body::to_bytes(resp.body_mut()).await.unwrap(); - let prom_metrics = String::from_utf8_lossy(&body) + String::from_utf8_lossy(&body) .to_string() .split('\n') - .filter(|l| l.contains("_count") && !l.contains("apollo_router_span_count")) + .filter(|l| l.contains("bucket") && !l.contains("apollo_router_span_count")) .sorted() - .join("\n"); - assert_snapshot!(prom_metrics); + .join("\n") } - #[tokio::test(flavor = "multi_thread")] - async fn it_test_prometheus_metrics_custom_buckets() { + async fn make_supergraph_request(plugin: &dyn DynPlugin) { let mut mock_service = MockSupergraphService::new(); mock_service .expect_call() @@ -2503,162 +2144,8 @@ mod tests { .unwrap()) }); - let mut mock_bad_request_service = MockSupergraphService::new(); - mock_bad_request_service - .expect_call() - .times(1) - .returning(move |req: SupergraphRequest| { - Ok(SupergraphResponse::fake_builder() - .context(req.context) - .status_code(StatusCode::BAD_REQUEST) - 
.data(json!({"errors": [{"message": "nope"}]})) - .build() - .unwrap()) - }); - - let mut mock_subgraph_service = MockSubgraphService::new(); - mock_subgraph_service - .expect_call() - .times(1) - .returning(move |req: SubgraphRequest| { - let mut extension = Object::new(); - extension.insert( - serde_json_bytes::ByteString::from("status"), - serde_json_bytes::Value::String(ByteString::from("INTERNAL_SERVER_ERROR")), - ); - let _ = req - .context - .insert("my_key", "my_custom_attribute_from_context".to_string()) - .unwrap(); - Ok(SubgraphResponse::fake_builder() - .context(req.context) - .error( - Error::builder() - .message(String::from("an error occured")) - .extensions(extension) - .extension_code("FETCH_ERROR") - .build(), - ) - .build()) - }); - - let mut mock_subgraph_service_in_error = MockSubgraphService::new(); - mock_subgraph_service_in_error - .expect_call() - .times(1) - .returning(move |_req: SubgraphRequest| { - Err(Box::new(FetchError::SubrequestHttpError { - status_code: None, - service: String::from("my_subgraph_name_error"), - reason: String::from("cannot contact the subgraph"), - })) - }); - - let dyn_plugin: Box = crate::plugin::plugins() - .find(|factory| factory.name == "apollo.telemetry") - .expect("Plugin not found") - .create_instance( - &Value::from_str( - r#"{ - "apollo": { - "client_name_header": "name_header", - "client_version_header": "version_header", - "schema_id": "schema_sha" - }, - "metrics": { - "common": { - "service_name": "apollo-router", - "buckets": [5.0, 10.0, 20.0], - "attributes": { - "supergraph": { - "static": [ - { - "name": "myname", - "value": "label_value" - } - ], - "request": { - "header": [ - { - "named": "test", - "default": "default_value", - "rename": "renamed_value" - }, - { - "named": "another_test", - "default": "my_default_value" - } - ] - }, - "response": { - "header": [{ - "named": "x-custom" - }], - "body": [{ - "path": ".data.data.my_value", - "name": "my_value" - }] - } - }, - "subgraph": { - "all": 
{ - "errors": { - "include_messages": true, - "extensions": [{ - "name": "subgraph_error_extended_code", - "path": ".code" - }, { - "name": "message", - "path": ".reason" - }] - } - }, - "subgraphs": { - "my_subgraph_name": { - "request": { - "body": [{ - "path": ".query", - "name": "query_from_request" - }, { - "path": ".data", - "name": "unknown_data", - "default": "default_value" - }, { - "path": ".data2", - "name": "unknown_data_bis" - }] - }, - "response": { - "body": [{ - "path": ".errors[0].extensions.status", - "name": "error" - }] - }, - "context": [ - { - "named": "my_key" - } - ] - } - } - } - } - }, - "prometheus": { - "enabled": true - } - } - }"#, - ) - .unwrap(), - Default::default(), - Default::default(), - ) - .await - .unwrap(); - let mut supergraph_service = dyn_plugin.supergraph_service(BoxService::new(mock_service)); + let mut supergraph_service = plugin.supergraph_service(BoxService::new(mock_service)); let router_req = SupergraphRequest::fake_builder().header("test", "my_value_set"); - let _router_response = supergraph_service .ready() .await @@ -2669,105 +2156,356 @@ mod tests { .next_response() .await .unwrap(); + } - let mut bad_request_supergraph_service = - dyn_plugin.supergraph_service(BoxService::new(mock_bad_request_service)); - let router_req = SupergraphRequest::fake_builder().header("test", "my_value_set"); - - let _router_response = bad_request_supergraph_service - .ready() - .await - .unwrap() - .call(router_req.build().unwrap()) - .await - .unwrap() - .next_response() - .await - .unwrap(); - - let mut subgraph_service = - dyn_plugin.subgraph_service("my_subgraph_name", BoxService::new(mock_subgraph_service)); - let subgraph_req = SubgraphRequest::fake_builder() - .subgraph_request( - http_ext::Request::fake_builder() - .header("test", "my_value_set") - .body( - Request::fake_builder() - .query(String::from("query { test }")) - .build(), - ) - .build() - .unwrap(), + #[tokio::test(flavor = "multi_thread")] + async fn 
plugin_registered() { + crate::plugin::plugins() + .find(|factory| factory.name == "apollo.telemetry") + .expect("Plugin not found") + .create_instance( + &serde_json::json!({"apollo": {"schema_id":"abc"}, "tracing": {}}), + Default::default(), + Default::default(), ) - .build(); - let _subgraph_response = subgraph_service - .ready() - .await - .unwrap() - .call(subgraph_req) .await .unwrap(); - // Another subgraph - let mut subgraph_service = dyn_plugin.subgraph_service( - "my_subgraph_name_error", - BoxService::new(mock_subgraph_service_in_error), - ); - let subgraph_req = SubgraphRequest::fake_builder() - .subgraph_request( - http_ext::Request::fake_builder() - .header("test", "my_value_set") - .body( - Request::fake_builder() - .query(String::from("query { test }")) - .build(), - ) - .build() - .unwrap(), - ) - .build(); - let _subgraph_response = subgraph_service - .ready() - .await - .unwrap() - .call(subgraph_req) - .await - .expect_err("Must be in error"); + } - let http_req_prom = http::Request::get("http://localhost:9090/WRONG/URL/metrics") - .body(Default::default()) - .unwrap(); - let mut web_endpoint = dyn_plugin - .web_endpoints() - .into_iter() - .next() - .unwrap() - .1 - .into_iter() - .next() - .unwrap() - .into_router(); - let resp = web_endpoint - .ready() - .await - .unwrap() - .call(http_req_prom) - .await - .unwrap(); - assert_eq!(resp.status(), StatusCode::NOT_FOUND); + #[tokio::test] + async fn config_serialization() { + create_plugin_with_config(include_str!("testdata/config.router.yaml")).await; + } - let http_req_prom = http::Request::get("http://localhost:9090/metrics") - .body(Default::default()) - .unwrap(); - let mut resp = web_endpoint.oneshot(http_req_prom).await.unwrap(); - assert_eq!(resp.status(), StatusCode::OK); - let body = hyper::body::to_bytes(resp.body_mut()).await.unwrap(); - let prom_metrics = String::from_utf8_lossy(&body) - .to_string() - .split('\n') - .filter(|l| l.contains("bucket") && 
!l.contains("apollo_router_span_count")) - .sorted() - .join("\n"); - assert_snapshot!(prom_metrics); + #[tokio::test] + async fn test_supergraph_metrics_ok() { + async { + let plugin = + create_plugin_with_config(include_str!("testdata/custom_attributes.router.yaml")) + .await; + make_supergraph_request(plugin.as_ref()).await; + + assert_counter!( + "apollo_router_http_requests_total", + 1, + "another_test" = "my_default_value", + "my_value" = 2, + "myname" = "label_value", + "renamed_value" = "my_value_set", + "status" = "200", + "x-custom" = "coming_from_header" + ); + assert_histogram!( + "apollo_router_http_request_duration_seconds", + 1, + "another_test" = "my_default_value", + "my_value" = 2, + "myname" = "label_value", + "renamed_value" = "my_value_set", + "status" = "200", + "x-custom" = "coming_from_header" + ); + } + .with_metrics() + .await; + } + + #[tokio::test] + async fn test_supergraph_metrics_bad_request() { + async { + let plugin = + create_plugin_with_config(include_str!("testdata/custom_attributes.router.yaml")) + .await; + + let mut mock_bad_request_service = MockSupergraphService::new(); + mock_bad_request_service.expect_call().times(1).returning( + move |req: SupergraphRequest| { + Ok(SupergraphResponse::fake_builder() + .context(req.context) + .status_code(StatusCode::BAD_REQUEST) + .data(json!({"errors": [{"message": "nope"}]})) + .build() + .unwrap()) + }, + ); + let mut bad_request_supergraph_service = + plugin.supergraph_service(BoxService::new(mock_bad_request_service)); + let router_req = SupergraphRequest::fake_builder().header("test", "my_value_set"); + let _router_response = bad_request_supergraph_service + .ready() + .await + .unwrap() + .call(router_req.build().unwrap()) + .await + .unwrap() + .next_response() + .await + .unwrap(); + + assert_counter!( + "apollo_router_http_requests_total", + 1, + "another_test" = "my_default_value", + "error" = "400 Bad Request", + "myname" = "label_value", + "renamed_value" = "my_value_set", + 
"status" = "400" + ); + } + .with_metrics() + .await; + } + + #[tokio::test] + async fn test_subgraph_metrics_ok() { + async { + let plugin = + create_plugin_with_config(include_str!("testdata/custom_attributes.router.yaml")) + .await; + + let mut mock_subgraph_service = MockSubgraphService::new(); + mock_subgraph_service + .expect_call() + .times(1) + .returning(move |req: SubgraphRequest| { + let mut extension = Object::new(); + extension.insert( + serde_json_bytes::ByteString::from("status"), + serde_json_bytes::Value::String(ByteString::from( + "custom_error_for_propagation", + )), + ); + let _ = req + .context + .insert("my_key", "my_custom_attribute_from_context".to_string()) + .unwrap(); + Ok(SubgraphResponse::fake_builder() + .context(req.context) + .error( + Error::builder() + .message(String::from("an error occured")) + .extensions(extension) + .extension_code("FETCH_ERROR") + .build(), + ) + .build()) + }); + + let mut subgraph_service = + plugin.subgraph_service("my_subgraph_name", BoxService::new(mock_subgraph_service)); + let subgraph_req = SubgraphRequest::fake_builder() + .subgraph_request( + http_ext::Request::fake_builder() + .header("test", "my_value_set") + .body( + Request::fake_builder() + .query(String::from("query { test }")) + .build(), + ) + .build() + .unwrap(), + ) + .build(); + let _subgraph_response = subgraph_service + .ready() + .await + .unwrap() + .call(subgraph_req) + .await + .unwrap(); + + assert_counter!( + "apollo_router_http_requests_total", + 1, + "error" = "custom_error_for_propagation", + "my_key" = "my_custom_attribute_from_context", + "query_from_request" = "query { test }", + "status" = "200", + "subgraph" = "my_subgraph_name", + "unknown_data" = "default_value" + ); + } + .with_metrics() + .await; + } + + #[tokio::test] + async fn test_subgraph_metrics_http_error() { + async { + let plugin = + create_plugin_with_config(include_str!("testdata/custom_attributes.router.yaml")) + .await; + + let mut 
mock_subgraph_service_in_error = MockSubgraphService::new(); + mock_subgraph_service_in_error + .expect_call() + .times(1) + .returning(move |_req: SubgraphRequest| { + Err(Box::new(FetchError::SubrequestHttpError { + status_code: None, + service: String::from("my_subgraph_name_error"), + reason: String::from("cannot contact the subgraph"), + })) + }); + + let mut subgraph_service = plugin.subgraph_service( + "my_subgraph_name_error", + BoxService::new(mock_subgraph_service_in_error), + ); + + let subgraph_req = SubgraphRequest::fake_builder() + .subgraph_request( + http_ext::Request::fake_builder() + .header("test", "my_value_set") + .body( + Request::fake_builder() + .query(String::from("query { test }")) + .build(), + ) + .build() + .unwrap(), + ) + .build(); + let _subgraph_response = subgraph_service + .ready() + .await + .unwrap() + .call(subgraph_req) + .await + .expect_err("should be an error"); + + assert_counter!( + "apollo_router_http_requests_total", + 1, + "message" = "cannot contact the subgraph", + "status" = "500", + "subgraph" = "my_subgraph_name_error", + "subgraph_error_extended_code" = "SUBREQUEST_HTTP_ERROR" + ); + } + .with_metrics() + .await; + } + + #[tokio::test] + async fn test_subgraph_metrics_bad_request() { + async { + let plugin = + create_plugin_with_config(include_str!("testdata/custom_attributes.router.yaml")) + .await; + + let mut mock_bad_request_service = MockSupergraphService::new(); + mock_bad_request_service.expect_call().times(1).returning( + move |req: SupergraphRequest| { + Ok(SupergraphResponse::fake_builder() + .context(req.context) + .status_code(StatusCode::BAD_REQUEST) + .data(json!({"errors": [{"message": "nope"}]})) + .build() + .unwrap()) + }, + ); + + let mut bad_request_supergraph_service = + plugin.supergraph_service(BoxService::new(mock_bad_request_service)); + + let router_req = SupergraphRequest::fake_builder().header("test", "my_value_set"); + + let _router_response = bad_request_supergraph_service + .ready() 
+ .await + .unwrap() + .call(router_req.build().unwrap()) + .await + .unwrap() + .next_response() + .await + .unwrap(); + + assert_counter!( + "apollo_router_http_requests_total", + 1, + "another_test" = "my_default_value", + "error" = "400 Bad Request", + "myname" = "label_value", + "renamed_value" = "my_value_set", + "status" = "400" + ); + assert_histogram!( + "apollo_router_http_request_duration_seconds", + 1, + "another_test" = "my_default_value", + "error" = "400 Bad Request", + "myname" = "label_value", + "renamed_value" = "my_value_set", + "status" = "400" + ); + } + .with_metrics() + .await; + } + + #[tokio::test] + async fn it_test_prometheus_wrong_endpoint() { + async { + let plugin = + create_plugin_with_config(include_str!("testdata/prometheus.router.yaml")).await; + + let mut web_endpoint = plugin + .web_endpoints() + .into_iter() + .next() + .unwrap() + .1 + .into_iter() + .next() + .unwrap() + .into_router(); + + let http_req_prom = http::Request::get("http://localhost:9090/WRONG/URL/metrics") + .body(Default::default()) + .unwrap(); + + let resp = web_endpoint + .ready() + .await + .unwrap() + .call(http_req_prom) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::NOT_FOUND); + } + .with_metrics() + .await; + } + + #[tokio::test(flavor = "multi_thread")] + async fn it_test_prometheus_metrics() { + async { + let plugin = + create_plugin_with_config(include_str!("testdata/prometheus.router.yaml")).await; + make_supergraph_request(plugin.as_ref()).await; + let prometheus_metrics = get_prometheus_metrics(plugin.as_ref()).await; + assert_snapshot!(prometheus_metrics); + } + .with_metrics() + .await; + } + + #[tokio::test(flavor = "multi_thread")] + async fn it_test_prometheus_metrics_custom_buckets() { + async { + let plugin = create_plugin_with_config(include_str!( + "testdata/prometheus_custom_buckets.router.yaml" + )) + .await; + make_supergraph_request(plugin.as_ref()).await; + let prometheus_metrics = 
get_prometheus_metrics(plugin.as_ref()).await; + + assert_snapshot!(prometheus_metrics); + } + .with_metrics() + .await; } #[test] @@ -2808,6 +2546,7 @@ mod tests { #[tokio::test] async fn test_handle_error_throttling() { + let error_map = DashMap::new(); // Set up a fake subscriber so we can check log events. If this is useful then maybe it can be factored out into something reusable #[derive(Default)] struct TestVisitor { @@ -2846,15 +2585,18 @@ mod tests { async { // Log twice rapidly, they should get deduped - handle_error(opentelemetry::global::Error::Other( - "other error".to_string(), - )); - handle_error(opentelemetry::global::Error::Other( - "other error".to_string(), - )); - handle_error(opentelemetry::global::Error::Trace( - "trace error".to_string().into(), - )); + handle_error_internal( + opentelemetry::global::Error::Other("other error".to_string()), + &error_map, + ); + handle_error_internal( + opentelemetry::global::Error::Other("other error".to_string()), + &error_map, + ); + handle_error_internal( + opentelemetry::global::Error::Trace("trace error".to_string().into()), + &error_map, + ); } .with_subscriber(tracing_subscriber::registry().with(test_layer.clone())) .await; @@ -2865,9 +2607,10 @@ mod tests { // Sleep a bit and then log again, it should get logged tokio::time::sleep(Duration::from_millis(200)).await; async { - handle_error(opentelemetry::global::Error::Other( - "other error".to_string(), - )); + handle_error_internal( + opentelemetry::global::Error::Other("other error".to_string()), + &error_map, + ); } .with_subscriber(tracing_subscriber::registry().with(test_layer.clone())) .await; diff --git a/apollo-router/src/plugins/telemetry/otlp.rs b/apollo-router/src/plugins/telemetry/otlp.rs index 5c522721c9..28df409585 100644 --- a/apollo-router/src/plugins/telemetry/otlp.rs +++ b/apollo-router/src/plugins/telemetry/otlp.rs @@ -3,6 +3,8 @@ use std::collections::HashMap; use indexmap::map::Entry; use indexmap::IndexMap; +use 
opentelemetry::sdk::metrics::reader::TemporalitySelector; +use opentelemetry::sdk::metrics::InstrumentKind; use opentelemetry_otlp::HttpExporterBuilder; use opentelemetry_otlp::TonicExporterBuilder; use opentelemetry_otlp::WithExportConfig; @@ -218,6 +220,29 @@ pub(crate) enum Temporality { Delta, } +pub(crate) struct CustomTemporalitySelector( + pub(crate) opentelemetry::sdk::metrics::data::Temporality, +); + +impl TemporalitySelector for CustomTemporalitySelector { + fn temporality(&self, _kind: InstrumentKind) -> opentelemetry::sdk::metrics::data::Temporality { + self.0 + } +} + +impl From<&Temporality> for Box { + fn from(value: &Temporality) -> Self { + Box::new(match value { + Temporality::Cumulative => CustomTemporalitySelector( + opentelemetry::sdk::metrics::data::Temporality::Cumulative, + ), + Temporality::Delta => { + CustomTemporalitySelector(opentelemetry::sdk::metrics::data::Temporality::Delta) + } + }) + } +} + mod metadata_map_serde { use tonic::metadata::KeyAndValueRef; use tonic::metadata::MetadataKey; diff --git a/apollo-router/src/plugins/telemetry/reload.rs b/apollo-router/src/plugins/telemetry/reload.rs index 2dcf270f41..d24468d2ac 100644 --- a/apollo-router/src/plugins/telemetry/reload.rs +++ b/apollo-router/src/plugins/telemetry/reload.rs @@ -5,7 +5,6 @@ use std::sync::atomic::Ordering; use anyhow::anyhow; use anyhow::Result; use once_cell::sync::OnceCell; -use opentelemetry::metrics::noop::NoopMeterProvider; use opentelemetry::sdk::trace::Tracer; use opentelemetry::trace::TraceContextExt; use opentelemetry::trace::TracerProvider; @@ -30,11 +29,11 @@ use tracing_subscriber::Registry; use super::config::SamplerOption; use super::metrics::span_metrics_exporter::SpanMetricsLayer; use crate::axum_factory::utils::REQUEST_SPAN_NAME; +use crate::metrics::layer::MetricsLayer; +use crate::metrics::meter_provider; use crate::plugins::telemetry::formatters::filter_metric_events; use crate::plugins::telemetry::formatters::text::TextFormatter; use 
crate::plugins::telemetry::formatters::FilteringFormatter; -use crate::plugins::telemetry::metrics; -use crate::plugins::telemetry::metrics::layer::MetricsLayer; use crate::plugins::telemetry::tracing::reload::ReloadTracer; pub(crate) type LayeredRegistry = Layered; @@ -54,29 +53,25 @@ pub(super) static OPENTELEMETRY_TRACER_HANDLE: OnceCell< ReloadTracer, > = OnceCell::new(); -#[allow(clippy::type_complexity)] -static METRICS_LAYER_HANDLE: OnceCell< - Handle< - MetricsLayer, - Layered< - tracing_subscriber::reload::Layer< - Box + Send + Sync>, - LayeredTracer, - >, - LayeredTracer, - >, - >, -> = OnceCell::new(); - static FMT_LAYER_HANDLE: OnceCell< Handle + Send + Sync>, LayeredTracer>, > = OnceCell::new(); pub(super) static SPAN_SAMPLING_RATE: AtomicU64 = AtomicU64::new(0); +pub(super) static METRICS_LAYER: OnceCell = OnceCell::new(); +pub(crate) fn metrics_layer() -> &'static MetricsLayer { + METRICS_LAYER.get_or_init(|| MetricsLayer::new(meter_provider().clone())) +} + pub(crate) fn init_telemetry(log_level: &str) -> Result<()> { let hot_tracer = ReloadTracer::new( - opentelemetry::sdk::trace::TracerProvider::default().versioned_tracer("noop", None, None), + opentelemetry::sdk::trace::TracerProvider::default().versioned_tracer( + "noop", + None::, + None::, + None, + ), ); let opentelemetry_layer = tracing_opentelemetry::layer() .with_tracer(hot_tracer.clone()) @@ -112,8 +107,7 @@ pub(crate) fn init_telemetry(log_level: &str) -> Result<()> { let (fmt_layer, fmt_handle) = tracing_subscriber::reload::Layer::new(fmt); - let (metrics_layer, metrics_handle) = - tracing_subscriber::reload::Layer::new(MetricsLayer::new(&NoopMeterProvider::default())); + let metrics_layer = metrics_layer(); // Stash the reload handles so that we can hot reload later OPENTELEMETRY_TRACER_HANDLE @@ -127,16 +121,13 @@ pub(crate) fn init_telemetry(log_level: &str) -> Result<()> { .with(SpanMetricsLayer::default()) .with(opentelemetry_layer) .with(fmt_layer) - .with(metrics_layer) + 
.with(metrics_layer.clone()) .with(EnvFilter::try_new(log_level)?) .try_init()?; Ok(hot_tracer) }) .map_err(|e: BoxError| anyhow!("failed to set OpenTelemetry tracer: {e}"))?; - METRICS_LAYER_HANDLE - .set(metrics_handle) - .map_err(|_| anyhow!("failed to set metrics layer handle"))?; FMT_LAYER_HANDLE .set(fmt_handle) .map_err(|_| anyhow!("failed to set fmt layer handle"))?; @@ -144,16 +135,6 @@ pub(crate) fn init_telemetry(log_level: &str) -> Result<()> { Ok(()) } -pub(super) fn reload_metrics(layer: MetricsLayer) { - if let Some(handle) = METRICS_LAYER_HANDLE.get() { - // If we are now going live with a new controller then maybe stash it. - metrics::prometheus::commit_new_controller(); - handle - .reload(layer) - .expect("metrics layer reload must succeed"); - } -} - pub(super) fn reload_fmt(layer: Box + Send + Sync>) { if let Some(handle) = FMT_LAYER_HANDLE.get() { handle.reload(layer).expect("fmt layer reload must succeed"); diff --git a/apollo-router/src/plugins/telemetry/snapshots/apollo_router__plugins__telemetry__tests__it_test_prometheus_metrics.snap b/apollo-router/src/plugins/telemetry/snapshots/apollo_router__plugins__telemetry__tests__it_test_prometheus_metrics.snap index 4608c849dc..eae5de460a 100644 --- a/apollo-router/src/plugins/telemetry/snapshots/apollo_router__plugins__telemetry__tests__it_test_prometheus_metrics.snap +++ b/apollo-router/src/plugins/telemetry/snapshots/apollo_router__plugins__telemetry__tests__it_test_prometheus_metrics.snap @@ -1,8 +1,17 @@ --- source: apollo-router/src/plugins/telemetry/mod.rs -expression: prom_metrics +expression: prometheus_metrics --- -apollo_router_http_request_duration_seconds_count{another_test="my_default_value",error="400 Bad Request",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="400",otel_scope_name="apollo/router",otel_scope_version=""} 1 
-apollo_router_http_request_duration_seconds_count{another_test="my_default_value",my_value="2",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="200",x_custom="coming_from_header",otel_scope_name="apollo/router",otel_scope_version=""} 1 -apollo_router_http_request_duration_seconds_count{error="INTERNAL_SERVER_ERROR",my_key="my_custom_attribute_from_context",query_from_request="query { test }",service_name="apollo-router",status="200",subgraph="my_subgraph_name",unknown_data="default_value",otel_scope_name="apollo/router",otel_scope_version=""} 1 -apollo_router_http_request_duration_seconds_count{message="cannot contact the subgraph",service_name="apollo-router",status="500",subgraph="my_subgraph_name_error",subgraph_error_extended_code="SUBREQUEST_HTTP_ERROR",otel_scope_name="apollo/router",otel_scope_version=""} 1 +apollo_router_http_request_duration_seconds_bucket{status="200",otel_scope_name="apollo/router",le="+Inf"} 1 +apollo_router_http_request_duration_seconds_bucket{status="200",otel_scope_name="apollo/router",le="0.001"} 1 +apollo_router_http_request_duration_seconds_bucket{status="200",otel_scope_name="apollo/router",le="0.005"} 1 +apollo_router_http_request_duration_seconds_bucket{status="200",otel_scope_name="apollo/router",le="0.015"} 1 +apollo_router_http_request_duration_seconds_bucket{status="200",otel_scope_name="apollo/router",le="0.05"} 1 +apollo_router_http_request_duration_seconds_bucket{status="200",otel_scope_name="apollo/router",le="0.1"} 1 +apollo_router_http_request_duration_seconds_bucket{status="200",otel_scope_name="apollo/router",le="0.2"} 1 +apollo_router_http_request_duration_seconds_bucket{status="200",otel_scope_name="apollo/router",le="0.3"} 1 +apollo_router_http_request_duration_seconds_bucket{status="200",otel_scope_name="apollo/router",le="0.4"} 1 +apollo_router_http_request_duration_seconds_bucket{status="200",otel_scope_name="apollo/router",le="0.5"} 1 
+apollo_router_http_request_duration_seconds_bucket{status="200",otel_scope_name="apollo/router",le="1"} 1 +apollo_router_http_request_duration_seconds_bucket{status="200",otel_scope_name="apollo/router",le="10"} 1 +apollo_router_http_request_duration_seconds_bucket{status="200",otel_scope_name="apollo/router",le="5"} 1 diff --git a/apollo-router/src/plugins/telemetry/snapshots/apollo_router__plugins__telemetry__tests__it_test_prometheus_metrics_custom_buckets.snap b/apollo-router/src/plugins/telemetry/snapshots/apollo_router__plugins__telemetry__tests__it_test_prometheus_metrics_custom_buckets.snap index eecbdbd9fe..3f346c2ad6 100644 --- a/apollo-router/src/plugins/telemetry/snapshots/apollo_router__plugins__telemetry__tests__it_test_prometheus_metrics_custom_buckets.snap +++ b/apollo-router/src/plugins/telemetry/snapshots/apollo_router__plugins__telemetry__tests__it_test_prometheus_metrics_custom_buckets.snap @@ -1,20 +1,8 @@ --- source: apollo-router/src/plugins/telemetry/mod.rs -expression: prom_metrics +expression: prometheus_metrics --- -apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",error="400 Bad Request",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="400",otel_scope_name="apollo/router",otel_scope_version="",le="+Inf"} 1 -apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",error="400 Bad Request",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="400",otel_scope_name="apollo/router",otel_scope_version="",le="10"} 1 -apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",error="400 Bad Request",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="400",otel_scope_name="apollo/router",otel_scope_version="",le="20"} 1 -apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",error="400 Bad 
Request",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="400",otel_scope_name="apollo/router",otel_scope_version="",le="5"} 1 -apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",my_value="2",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="200",x_custom="coming_from_header",otel_scope_name="apollo/router",otel_scope_version="",le="+Inf"} 1 -apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",my_value="2",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="200",x_custom="coming_from_header",otel_scope_name="apollo/router",otel_scope_version="",le="10"} 1 -apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",my_value="2",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="200",x_custom="coming_from_header",otel_scope_name="apollo/router",otel_scope_version="",le="20"} 1 -apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",my_value="2",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="200",x_custom="coming_from_header",otel_scope_name="apollo/router",otel_scope_version="",le="5"} 1 -apollo_router_http_request_duration_seconds_bucket{error="INTERNAL_SERVER_ERROR",my_key="my_custom_attribute_from_context",query_from_request="query { test }",service_name="apollo-router",status="200",subgraph="my_subgraph_name",unknown_data="default_value",otel_scope_name="apollo/router",otel_scope_version="",le="+Inf"} 1 -apollo_router_http_request_duration_seconds_bucket{error="INTERNAL_SERVER_ERROR",my_key="my_custom_attribute_from_context",query_from_request="query { test }",service_name="apollo-router",status="200",subgraph="my_subgraph_name",unknown_data="default_value",otel_scope_name="apollo/router",otel_scope_version="",le="10"} 1 
-apollo_router_http_request_duration_seconds_bucket{error="INTERNAL_SERVER_ERROR",my_key="my_custom_attribute_from_context",query_from_request="query { test }",service_name="apollo-router",status="200",subgraph="my_subgraph_name",unknown_data="default_value",otel_scope_name="apollo/router",otel_scope_version="",le="20"} 1 -apollo_router_http_request_duration_seconds_bucket{error="INTERNAL_SERVER_ERROR",my_key="my_custom_attribute_from_context",query_from_request="query { test }",service_name="apollo-router",status="200",subgraph="my_subgraph_name",unknown_data="default_value",otel_scope_name="apollo/router",otel_scope_version="",le="5"} 1 -apollo_router_http_request_duration_seconds_bucket{message="cannot contact the subgraph",service_name="apollo-router",status="500",subgraph="my_subgraph_name_error",subgraph_error_extended_code="SUBREQUEST_HTTP_ERROR",otel_scope_name="apollo/router",otel_scope_version="",le="+Inf"} 1 -apollo_router_http_request_duration_seconds_bucket{message="cannot contact the subgraph",service_name="apollo-router",status="500",subgraph="my_subgraph_name_error",subgraph_error_extended_code="SUBREQUEST_HTTP_ERROR",otel_scope_name="apollo/router",otel_scope_version="",le="10"} 1 -apollo_router_http_request_duration_seconds_bucket{message="cannot contact the subgraph",service_name="apollo-router",status="500",subgraph="my_subgraph_name_error",subgraph_error_extended_code="SUBREQUEST_HTTP_ERROR",otel_scope_name="apollo/router",otel_scope_version="",le="20"} 1 -apollo_router_http_request_duration_seconds_bucket{message="cannot contact the subgraph",service_name="apollo-router",status="500",subgraph="my_subgraph_name_error",subgraph_error_extended_code="SUBREQUEST_HTTP_ERROR",otel_scope_name="apollo/router",otel_scope_version="",le="5"} 1 +apollo_router_http_request_duration_seconds_bucket{status="200",otel_scope_name="apollo/router",le="+Inf"} 1 +apollo_router_http_request_duration_seconds_bucket{status="200",otel_scope_name="apollo/router",le="10"} 
1 +apollo_router_http_request_duration_seconds_bucket{status="200",otel_scope_name="apollo/router",le="20"} 1 +apollo_router_http_request_duration_seconds_bucket{status="200",otel_scope_name="apollo/router",le="5"} 1 diff --git a/apollo-router/src/plugins/telemetry/testdata/config.router.yaml b/apollo-router/src/plugins/telemetry/testdata/config.router.yaml new file mode 100644 index 0000000000..598d8d80ed --- /dev/null +++ b/apollo-router/src/plugins/telemetry/testdata/config.router.yaml @@ -0,0 +1,101 @@ +telemetry: + tracing: + trace_config: + service_name: router + attributes: + str: a + int: 1 + float: 1 + bool: true + str_arr: + - a + - b + int_arr: + - 1 + - 2 + float_arr: + - 1 + - 2 + bool_arr: + - true + - false + metrics: + common: + attributes: + supergraph: + static: + - name: myname + value: label_value + request: + header: + - named: test + default: default_value + rename: renamed_value + body: + - path: .data.test + name: my_new_name + default: default_value + response: + header: + - named: test + default: default_value + rename: renamed_value + - named: test + default: default_value + rename: renamed_value + body: + - path: .data.test + name: my_new_name + default: default_value + subgraph: + all: + static: + - name: myname + value: label_value + request: + header: + - named: test + default: default_value + rename: renamed_value + body: + - path: .data.test + name: my_new_name + default: default_value + response: + header: + - named: test + default: default_value + rename: renamed_value + - named: test + default: default_value + rename: renamed_value + body: + - path: .data.test + name: my_new_name + default: default_value + subgraphs: + subgraph_name_test: + static: + - name: myname + value: label_value + request: + header: + - named: test + default: default_value + rename: renamed_value + body: + - path: .data.test + name: my_new_name + default: default_value + response: + header: + - named: test + default: default_value + rename: renamed_value + 
- named: test + default: default_value + rename: renamed_value + body: + - path: .data.test + name: my_new_name + default: default_value diff --git a/apollo-router/src/plugins/telemetry/testdata/custom_attributes.router.yaml b/apollo-router/src/plugins/telemetry/testdata/custom_attributes.router.yaml new file mode 100644 index 0000000000..4310d122d1 --- /dev/null +++ b/apollo-router/src/plugins/telemetry/testdata/custom_attributes.router.yaml @@ -0,0 +1,51 @@ +telemetry: + apollo: + client_name_header: name_header + client_version_header: version_header + metrics: + common: + service_name: apollo-router + attributes: + supergraph: + static: + - name: myname + value: label_value + request: + header: + - named: test + default: default_value + rename: renamed_value + - named: another_test + default: my_default_value + response: + header: + - named: x-custom + body: + - path: .data.data.my_value + name: my_value + subgraph: + all: + errors: + include_messages: true + extensions: + - name: subgraph_error_extended_code + path: .code + - name: message + path: .reason + subgraphs: + my_subgraph_name: + request: + body: + - path: .query + name: query_from_request + - path: .data + name: unknown_data + default: default_value + - path: .data2 + name: unknown_data_bis + response: + body: + - path: .errors[0].extensions.status + name: error + context: + - named: my_key diff --git a/apollo-router/src/plugins/telemetry/testdata/prometheus.router.yaml b/apollo-router/src/plugins/telemetry/testdata/prometheus.router.yaml new file mode 100644 index 0000000000..47637813ab --- /dev/null +++ b/apollo-router/src/plugins/telemetry/testdata/prometheus.router.yaml @@ -0,0 +1,7 @@ +telemetry: + apollo: + client_name_header: name_header + client_version_header: version_header + metrics: + prometheus: + enabled: true diff --git a/apollo-router/src/plugins/telemetry/testdata/prometheus_custom_buckets.router.yaml 
b/apollo-router/src/plugins/telemetry/testdata/prometheus_custom_buckets.router.yaml new file mode 100644 index 0000000000..e96a4b5d71 --- /dev/null +++ b/apollo-router/src/plugins/telemetry/testdata/prometheus_custom_buckets.router.yaml @@ -0,0 +1,13 @@ +telemetry: + apollo: + client_name_header: name_header + client_version_header: version_header + metrics: + common: + service_name: apollo-router + buckets: + - 5 + - 10 + - 20 + prometheus: + enabled: true diff --git a/apollo-router/src/plugins/telemetry/tracing/apollo_telemetry.rs b/apollo-router/src/plugins/telemetry/tracing/apollo_telemetry.rs index 986a119113..8bf8ab044e 100644 --- a/apollo-router/src/plugins/telemetry/tracing/apollo_telemetry.rs +++ b/apollo-router/src/plugins/telemetry/tracing/apollo_telemetry.rs @@ -23,7 +23,6 @@ use opentelemetry::trace::SpanId; use opentelemetry::trace::TraceError; use opentelemetry::Key; use opentelemetry::Value; -use opentelemetry_semantic_conventions::trace::HTTP_METHOD; use prost::Message; use serde::de::DeserializeOwned; use thiserror::Error; @@ -74,6 +73,8 @@ use crate::query_planner::FLATTEN_SPAN_NAME; use crate::query_planner::PARALLEL_SPAN_NAME; use crate::query_planner::SEQUENCE_SPAN_NAME; +// TODO Remove this and use otel constants again https://github.com/apollographql/router/issues/3833 +const HTTP_METHOD: Key = Key::from_static_str("http.method"); const APOLLO_PRIVATE_REQUEST: Key = Key::from_static_str("apollo_private.request"); pub(crate) const APOLLO_PRIVATE_DURATION_NS: &str = "apollo_private.duration_ns"; const APOLLO_PRIVATE_DURATION_NS_KEY: Key = Key::from_static_str(APOLLO_PRIVATE_DURATION_NS); diff --git a/apollo-router/src/plugins/telemetry/tracing/jaeger.rs b/apollo-router/src/plugins/telemetry/tracing/jaeger.rs index d7650c76b7..ac123a0e6c 100644 --- a/apollo-router/src/plugins/telemetry/tracing/jaeger.rs +++ b/apollo-router/src/plugins/telemetry/tracing/jaeger.rs @@ -1,14 +1,9 @@ //! Configuration for jaeger tracing. 
use std::fmt::Debug; -use opentelemetry::sdk::export::trace::SpanData; +use opentelemetry::runtime; use opentelemetry::sdk::trace::BatchSpanProcessor; use opentelemetry::sdk::trace::Builder; -use opentelemetry::sdk::trace::Span; -use opentelemetry::sdk::trace::SpanProcessor; -use opentelemetry::sdk::trace::TracerProvider; -use opentelemetry::trace::TraceResult; -use opentelemetry::Context; use schemars::JsonSchema; use serde::Deserialize; use serde::Serialize; @@ -102,7 +97,7 @@ impl TracingConfigurator for Config { tracing::info!("Configuring Jaeger tracing: {}", batch_processor); // We are waiting for a release of https://github.com/open-telemetry/opentelemetry-rust/issues/894 // Until that time we need to wrap a tracer provider with Jeager in. - let tracer_provider = opentelemetry_jaeger::new_collector_pipeline() + let exporter = opentelemetry_jaeger::new_collector_pipeline() .with_trace_config(trace_config.into()) .with_service_name(trace_config.service_name.clone()) .with(&collector.username, |b, u| b.with_username(u)) @@ -110,34 +105,13 @@ impl TracingConfigurator for Config { .with_endpoint(&collector.endpoint.to_string()) .with_reqwest() .with_batch_processor_config(batch_processor.clone().into()) - .build_batch(opentelemetry::runtime::Tokio)?; - Ok(builder - .with_span_processor(DelegateSpanProcessor { tracer_provider }.filtered())) + .build_collector_exporter::()?; + Ok(builder.with_span_processor( + BatchSpanProcessor::builder(exporter, opentelemetry::runtime::Tokio) + .with_batch_config(batch_processor.clone().into()) + .build(), + )) } } } } - -#[derive(Debug)] -struct DelegateSpanProcessor { - tracer_provider: TracerProvider, -} - -impl SpanProcessor for DelegateSpanProcessor { - fn on_start(&self, span: &mut Span, cx: &Context) { - self.tracer_provider.span_processors()[0].on_start(span, cx) - } - - fn on_end(&self, span: SpanData) { - self.tracer_provider.span_processors()[0].on_end(span) - } - - fn force_flush(&self) -> TraceResult<()> { - 
self.tracer_provider.span_processors()[0].force_flush() - } - - fn shutdown(&mut self) -> TraceResult<()> { - // It's safe to not call shutdown as dropping tracer_provider will cause shutdown to happen separately. - Ok(()) - } -} diff --git a/apollo-router/src/plugins/telemetry/tracing/reload.rs b/apollo-router/src/plugins/telemetry/tracing/reload.rs index fcf7d1a395..7c3aa1a5c0 100644 --- a/apollo-router/src/plugins/telemetry/tracing/reload.rs +++ b/apollo-router/src/plugins/telemetry/tracing/reload.rs @@ -22,14 +22,14 @@ impl PreSampledTracer for ReloadTracer { .sampled_context(data) } - fn new_trace_id(&self) -> opentelemetry::trace::TraceId { + fn new_trace_id(&self) -> opentelemetry_api::trace::TraceId { self.parent .read() .expect("parent tracer must be available") .new_trace_id() } - fn new_span_id(&self) -> opentelemetry::trace::SpanId { + fn new_span_id(&self) -> opentelemetry_api::trace::SpanId { self.parent .read() .expect("parent tracer must be available") diff --git a/apollo-router/src/tracer.rs b/apollo-router/src/tracer.rs index 2084f4be9b..783254bdcc 100644 --- a/apollo-router/src/tracer.rs +++ b/apollo-router/src/tracer.rs @@ -54,7 +54,7 @@ mod test { use std::sync::Mutex; use once_cell::sync::Lazy; - use opentelemetry::sdk::export::trace::stdout; + use opentelemetry_api::trace::TracerProvider; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::Registry; @@ -85,13 +85,22 @@ mod test { assert!(other_id == my_id); } - #[test] - fn it_returns_valid_trace_id() { + #[tokio::test] + async fn it_returns_valid_trace_id() { let _guard = TRACING_LOCK .lock() .unwrap_or_else(|poisoned| poisoned.into_inner()); // Create a tracing layer with the configured tracer - let tracer = stdout::new_pipeline().install_simple(); + + let provider = opentelemetry::sdk::trace::TracerProvider::builder() + .with_simple_exporter( + opentelemetry_stdout::SpanExporter::builder() + .with_writer(std::io::stdout()) + .build(), + ) + .build(); + let tracer = 
provider.versioned_tracer("noop", None::, None::, None); + let telemetry = tracing_opentelemetry::layer().with_tracer(tracer); // Use the tracing subscriber `Registry`, or any other subscriber // that impls `LookupSpan` @@ -112,7 +121,10 @@ mod test { let my_id = TraceId::maybe_new(); assert!(my_id.is_none()); // Create a tracing layer with the configured tracer - let tracer = stdout::new_pipeline().install_simple(); + let provider = opentelemetry::sdk::trace::TracerProvider::builder() + .with_simple_exporter(opentelemetry_stdout::SpanExporter::default()) + .build(); + let tracer = provider.versioned_tracer("noop", None::, None::, None); let telemetry = tracing_opentelemetry::layer().with_tracer(tracer); // Use the tracing subscriber `Registry`, or any other subscriber // that impls `LookupSpan` @@ -134,7 +146,10 @@ mod test { .lock() .unwrap_or_else(|poisoned| poisoned.into_inner()); // Create a tracing layer with the configured tracer - let tracer = stdout::new_pipeline().install_simple(); + let provider = opentelemetry::sdk::trace::TracerProvider::builder() + .with_simple_exporter(opentelemetry_stdout::SpanExporter::default()) + .build(); + let tracer = provider.versioned_tracer("noop", None::, None::, None); let telemetry = tracing_opentelemetry::layer().with_tracer(tracer); // Use the tracing subscriber `Registry`, or any other subscriber // that impls `LookupSpan` diff --git a/apollo-router/tests/metrics_tests.rs b/apollo-router/tests/metrics_tests.rs index ff089008fd..5dc48a2aed 100644 --- a/apollo-router/tests/metrics_tests.rs +++ b/apollo-router/tests/metrics_tests.rs @@ -43,8 +43,8 @@ async fn test_metrics_reloading() -> Result<(), BoxError> { router.assert_reloaded().await; } - router.assert_metrics_contains(r#"apollo_router_cache_hit_count_total{kind="query planner",service_name="apollo-router",storage="memory",otel_scope_name="apollo/router",otel_scope_version=""} 4"#, None).await; - 
router.assert_metrics_contains(r#"apollo_router_cache_miss_count_total{kind="query planner",service_name="apollo-router",storage="memory",otel_scope_name="apollo/router",otel_scope_version=""} 2"#, None).await; + router.assert_metrics_contains(r#"apollo_router_cache_hit_count_total{kind="query planner",storage="memory",otel_scope_name="apollo/router"} 4"#, None).await; + router.assert_metrics_contains(r#"apollo_router_cache_miss_count_total{kind="query planner",storage="memory",otel_scope_name="apollo/router"} 2"#, None).await; router .assert_metrics_contains(r#"apollo_router_cache_hit_time"#, None) .await; @@ -65,8 +65,8 @@ async fn test_metrics_reloading() -> Result<(), BoxError> { .await; if std::env::var("APOLLO_KEY").is_ok() && std::env::var("APOLLO_GRAPH_REF").is_ok() { - router.assert_metrics_contains(r#"apollo_router_uplink_fetch_duration_seconds_count{kind="unchanged",query="License",service_name="apollo-router",url="https://uplink.api.apollographql.com/",otel_scope_name="apollo/router",otel_scope_version=""}"#, Some(Duration::from_secs(120))).await; - router.assert_metrics_contains(r#"apollo_router_uplink_fetch_count_total{query="License",service_name="apollo-router",status="success",otel_scope_name="apollo/router",otel_scope_version=""}"#, Some(Duration::from_secs(1))).await; + router.assert_metrics_contains(r#"apollo_router_uplink_fetch_duration_seconds_count{kind="unchanged",query="License",url="https://uplink.api.apollographql.com/",otel_scope_name="apollo/router"}"#, Some(Duration::from_secs(120))).await; + router.assert_metrics_contains(r#"apollo_router_uplink_fetch_count_total{query="License",status="success",otel_scope_name="apollo/router"}"#, Some(Duration::from_secs(1))).await; } Ok(()) @@ -107,7 +107,7 @@ async fn test_subgraph_auth_metrics() -> Result<(), BoxError> { .unwrap() ); - 
router.assert_metrics_contains(r#"apollo_router_operations_authentication_aws_sigv4_total{authentication_aws_sigv4_failed="false",service_name="apollo-router",subgraph_service_name="products",otel_scope_name="apollo/router",otel_scope_version=""} 2"#, None).await; + router.assert_metrics_contains(r#"apollo_router_operations_authentication_aws_sigv4_total{authentication_aws_sigv4_failed="false",subgraph_service_name="products",otel_scope_name="apollo/router"} 2"#, None).await; Ok(()) } diff --git a/apollo-router/tests/snapshots/tracing_tests__traced_basic_composition.snap b/apollo-router/tests/snapshots/tracing_tests__traced_basic_composition.snap index 8c976a59d0..cf291813c3 100644 --- a/apollo-router/tests/snapshots/tracing_tests__traced_basic_composition.snap +++ b/apollo-router/tests/snapshots/tracing_tests__traced_basic_composition.snap @@ -200,14 +200,6 @@ expression: get_spans() [ "apollo_private.operation_signature", "# -\n{topProducts{name reviews{author{id name}id product{name}}upc}}" - ], - [ - "monotonic_counter.apollo.router.operations", - 1 - ], - [ - "http.response.status_code", - 200 ] ], "metadata": { diff --git a/apollo-router/tests/snapshots/tracing_tests__traced_basic_request.snap b/apollo-router/tests/snapshots/tracing_tests__traced_basic_request.snap index e5c11265d4..bdb77d329c 100644 --- a/apollo-router/tests/snapshots/tracing_tests__traced_basic_request.snap +++ b/apollo-router/tests/snapshots/tracing_tests__traced_basic_request.snap @@ -200,14 +200,6 @@ expression: get_spans() [ "apollo_private.operation_signature", "# -\n{topProducts{name name}}" - ], - [ - "monotonic_counter.apollo.router.operations", - 1 - ], - [ - "http.response.status_code", - 200 ] ], "metadata": { diff --git a/apollo-router/tests/snapshots/tracing_tests__variables.snap b/apollo-router/tests/snapshots/tracing_tests__variables.snap index 6ea9d10fa8..0d300d41b0 100644 --- a/apollo-router/tests/snapshots/tracing_tests__variables.snap +++ 
b/apollo-router/tests/snapshots/tracing_tests__variables.snap @@ -188,14 +188,6 @@ expression: get_spans() [ "apollo_private.operation_signature", "# ExampleQuery\nquery ExampleQuery($reviewsForAuthorAuthorId:ID!,$topProductsFirst:Int){topProducts(first:$topProductsFirst){name reviewsForAuthor(authorID:$reviewsForAuthorAuthorId){author{id name}body}}}" - ], - [ - "monotonic_counter.apollo.router.operations", - 1 - ], - [ - "http.response.status_code", - 400 ] ], "metadata": { diff --git a/dev-docs/metrics.md b/dev-docs/metrics.md new file mode 100644 index 0000000000..34530201ef --- /dev/null +++ b/dev-docs/metrics.md @@ -0,0 +1,181 @@ +# Metrics + +The Router uses OpenTelemetry metrics to support Prometheus and OTLP exporters. + +## Requirements +* Filtering of metrics to Public and Private exporters. This is to support Apollo only metrics and to exclude sending of legacy metrics to Apollo. +* Multiple exporters - Prometheus and OTLP. +* Prometheus metrics must persist across reloads. +* Metrics must be testable. 
+ +## Entities +```mermaid + +erDiagram + + callsite-tracing ||--|{ metrics-layer : uses + callsite-macro ||--|{ aggregate-meter-provider : uses + callsite-macro ||--|{ instrument : mutates + + metrics-layer ||--|| aggregate-meter-provider : uses + metrics-layer ||--|{ instrument : mutates + + telemetry-plugin ||--|| metrics-layer : clears + telemetry-plugin ||--|| aggregate-meter-provider : configures + + aggregate-meter-provider ||--|| public-filtered-meter-provider : uses + aggregate-meter-provider ||--|| public-filtered-prometheus-meter-provider : uses + aggregate-meter-provider ||--|| private-filtered-meter-provider : uses + + public-filtered-meter-provider ||--|{ public-meter-provider : uses + public-filtered-prometheus-meter-provider ||--|{ public-prometheus-meter-provider : uses + private-filtered-meter-provider ||--|{ private-meter-provider : uses + + public-meter-provider ||--|{ public-meter : creates + public-prometheus-meter-provider ||--|{ public-prometheus-meter : creates + private-meter-provider ||--|{ private-meter : creates + + public-meter ||--|{ instrument : creates + public-prometheus-meter ||--|{ instrument : creates + private-meter ||--|{ instrument : creates + + instrument + + "exporter(s)" ||--|{ public-meter : observes + prometheus-exporter ||--|{ public-prometheus-meter : observes + prometheus-registry ||--|| prometheus-exporter : observes + private-otlp-exporter ||--|{ private-meter : observes + +``` + +### Instrument +A histogram, counter or gauge that is used to record metrics. + +### Meter +Creates instruments, and also contains a reference to the exporters so that instruments are connected to them when created: +* __Public meter__ - Exports all public metrics to configured exporters except for Prometheus. +* __Public prometheus meter__ - Exports all public metrics to Prometheus. +* __Private meter__ - Exports private (Apollo-only) metrics to Apollo. + + +### Meter provider +Creates meters: +* __Public meter provider__ - Creates public meters (see above).
+* __Public prometheus meter provider__ - Creates public prometheus meters (see above). +* __Private meter provider__ - Creates private meters (see above). + +### Filter meter provider +Depending on the meter name, this either returns a no-op meter or delegates to a real meter provider. Used to filter public vs private metrics. + +### Aggregate meter provider +A meter provider that wraps public, public prometheus, and private meter providers. Used to create a single meter provider that can be used by the metrics layer and metrics macros. +This meter provider is also responsible for maintaining a strong reference to all instruments that are currently valid. This enables [callsite instrument caching](#callsite-instrument-caching). + +### Metrics layer +The tracing-opentelemetry layer that is used to create instruments and meters. This will cache instruments after they have been created. + +### Metrics macros +New macros that will be used for metrics going forward. Allows unit testing of metrics. + +### Prometheus registry +Used to render prometheus metrics. Contains no state. + +## Design gotchas +The metrics code is substantial; however, there are reasons that it is structured in the way that it is. + +1. There is no way to filter instruments at the exporter level. This is the reason that we have aggregate meter providers that wrap the public, public prometheus, and private meter providers. This allows us to filter out private metrics at the meter provider level. +2. The meter provider and meter layer are both globals. This has made testing hard. The new metrics macros should be used as they have built-in support for testing by moving the meter provider to a task or thread local. +3. Prometheus meters need to be kept around across reloads, otherwise metrics are reset. This is why the aggregate meter provider allows internal mutability. + +## Using metrics macros + +Metrics macros are a replacement for the tracing-opentelemetry metrics-layer.
+They are highly optimised, allow dynamic attributes, are easy to use and support unit testing. + +### Usage + +There are two classes of instrument: observable and non-observable. Observable instruments will ask for their value when they are exported; non-observable instruments will update at the point of mutation. + +Observable gauges are attached to a particular meter, so they MUST be created after the telemetry plugin `activate()` has been called, as this is the point where meters will be updated. +We're going to have to think about how to make this less brittle. + +```rust +// non-observable instruments - good for histograms and counters +u64_counter!("test", "test description", 1, vec![KeyValue::new("attr", "val")]); +u64_counter!("test", "test description", 1, "attr" => "val"); +u64_counter!("test", "test description", 1); + +// observable instruments - good for gauges +meter_provider() + .meter("test") + .u64_observable_gauge("test") + .with_callback(|m| m.observe(5, &[])) + .init(); +``` + +### Testing +When using the macro in a test you will need a different pattern depending on whether you are writing a sync or async test. + +#### Testing Sync +```rust + #[test] + fn test_non_async() { + // Each test is run in a separate thread, metrics are stored in a thread local. + u64_counter!("test", "test description", 1, "attr" => "val"); + assert_counter!("test", 1, "attr" => "val"); + } +``` + +#### Testing Async + +Make sure to use the `.with_metrics()` method on the async block to ensure that the metrics are stored in a task local.
+*Tests will silently fail to record metrics if this is not done.* +```rust + #[tokio::test(flavor = "multi_thread")] + async fn test_async_multi() { + // Multi-threaded runtime needs to use a tokio task local to avoid tests interfering with each other + async { + u64_counter!("test", "test description", 1, "attr" => "val"); + assert_counter!("test", 1, "attr" => "val"); + } + .with_metrics() + .await; + } + + #[tokio::test] + async fn test_async_single() { + async { + // It's a single threaded tokio runtime, so we can still use a thread local + u64_counter!("test", "test description", 1, "attr" => "val"); + assert_counter!("test", 1, "attr" => "val"); + } + .with_metrics() + .await; + } +``` + +## Callsite instrument caching + +When using the new metrics macros a reference to an instrument is cached to ensure that the meter provider does not have to be queried over and over. + +```mermaid + +flowchart TD + Callsite --> RefCheck + RefCheck -->|not upgradable| Create + RefCheck -->|upgradable| Use + Create --> Store + Store --> Use + RefCheck{"Static\nMutex < Weak < Instrument > >"} + Create("Create instrument Arc < Instrument >") + Store("Store downgraded clone in Mutex") + Use("Use strong reference to instrument") +``` + +Aggregate meter provider is responsible for maintaining a strong reference to all instruments that are valid. + +Strong references to instruments will be discarded when changes to the aggregate meter provider take place. This will cause every callsite to refresh its reference to the instrument. + +On the fast path the mutex is locked for the period that it takes to upgrade the weak reference. This is a fast operation, and should not block the thread for any meaningful period of time. + +If there is shown to be contention in future profiling we can revisit. 
diff --git a/docs/source/configuration/metrics.mdx b/docs/source/configuration/metrics.mdx index f85ff5d428..bf6ed273ac 100644 --- a/docs/source/configuration/metrics.mdx +++ b/docs/source/configuration/metrics.mdx @@ -213,7 +213,7 @@ telemetry: path: .type # JSON query path to fetch data from extensions - name: message path: .reason - # Will create this kind of metric for example apollo_router_http_requests_error_total{message="cannot contact the subgraph",service_name="apollo-router",subgraph="my_subgraph_name",subgraph_error_extended_type="SubrequestHttpError"} + # Will create this kind of metric for example apollo_router_http_requests_error_total{message="cannot contact the subgraph",subgraph="my_subgraph_name",subgraph_error_extended_type="SubrequestHttpError"} subgraphs: my_subgraph_name: # Apply these rules only for the subgraph named `my_subgraph_name` request: