Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(telemetry): add metrics for query plan warmup and schema loading #3807

Merged
merged 10 commits into from
Sep 15, 2023
42 changes: 42 additions & 0 deletions .changesets/feat_bnjjj_fix_3767.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
### feat(telemetry): add metrics for query plan warmup and schema load ([Issue #3767](https://github.com/apollographql/router/issues/3767))

It adds histogram metrics for `apollo_router_query_planning_warmup_duration` and `apollo_router_schema_load_duration`.

Example in Prometheus:

```
# HELP apollo_router_query_planning_warmup_duration apollo_router_query_planning_warmup_duration
# TYPE apollo_router_query_planning_warmup_duration histogram
apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="0.05"} 1
apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="0.1"} 1
apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="0.25"} 1
apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="0.5"} 1
apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="1"} 1
apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="2.5"} 1
apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="5"} 1
apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="10"} 1
apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="20"} 1
apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="100"} 1
apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="1000"} 1
apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="+Inf"} 1
apollo_router_query_planning_warmup_duration_sum{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version=""} 0.022390619
apollo_router_query_planning_warmup_duration_count{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version=""} 1
# HELP apollo_router_schema_load_duration apollo_router_schema_load_duration
# TYPE apollo_router_schema_load_duration histogram
apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="0.05"} 8
apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="0.1"} 8
apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="0.25"} 8
apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="0.5"} 8
apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="1"} 8
apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="2.5"} 8
apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="5"} 8
apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="10"} 8
apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="20"} 8
apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="100"} 8
apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="1000"} 8
apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="+Inf"} 8
apollo_router_schema_load_duration_sum{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version=""} 0.023486205999999996
apollo_router_schema_load_duration_count{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version=""} 8
```

By [@bnjjj](https://github.com/bnjjj) in https://github.com/apollographql/router/pull/3807
6 changes: 4 additions & 2 deletions apollo-router/src/plugins/telemetry/metrics/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,10 @@ impl<T: MeterProvider> FilterMeterProvider<T> {
FilterMeterProvider::builder()
.delegate(delegate)
.allow(
Regex::new(r"apollo\.(graphos\.cloud|router\.(operations?|config))(\..*|$)")
.expect("regex should have been valid"),
Regex::new(
r"apollo\.(graphos\.cloud|router\.(operations?|config|schema|query))(\..*|$)",
)
.expect("regex should have been valid"),
)
.build()
}
Expand Down
38 changes: 38 additions & 0 deletions apollo-router/src/plugins/telemetry/utils.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
use std::time::Duration;
use std::time::Instant;

use tracing_core::field::Value;

pub(crate) trait TracingUtils {
Expand All @@ -13,3 +16,38 @@ impl TracingUtils for bool {
}
}
}

/// Timer implementing Drop to automatically compute the duration between the moment it has been created until it's dropped
///```ignore
/// Timer::new(|duration| {
/// tracing::info!(histogram.apollo_router_test = duration.as_secs_f64());
/// })
/// ```
pub(crate) struct Timer<F>
where
F: FnOnce(Duration),
{
start: Instant,
f: Option<F>,
}

impl<F> Timer<F>
where
F: FnOnce(Duration),
{
pub(crate) fn new(f: F) -> Self {
Self {
start: Instant::now(),
f: f.into(),
}
}
}

impl<F> Drop for Timer<F>
where
F: FnOnce(Duration),
{
fn drop(&mut self) {
self.f.take().expect("f must exist")(self.start.elapsed())
}
}
2 changes: 1 addition & 1 deletion apollo-router/src/query_planner/bridge_query_planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ impl Service<QueryPlannerRequest> for BridgeQueryPlanner {
)
.await;
let duration = start.elapsed().as_secs_f64();
tracing::info!(histogram.apollo_router_query_planning_time = duration,);
tracing::info!(histogram.apollo_router_query_planning_time = duration);

match res {
Ok(query_planner_content) => Ok(QueryPlannerResponse::builder()
Expand Down
6 changes: 6 additions & 0 deletions apollo-router/src/query_planner/caching_query_planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use crate::error::CacheResolverError;
use crate::error::QueryPlannerError;
use crate::plugins::authorization::AuthorizationPlugin;
use crate::plugins::authorization::CacheKeyMetadata;
use crate::plugins::telemetry::utils::Timer;
use crate::query_planner::labeler::add_defer_labels;
use crate::query_planner::BridgeQueryPlanner;
use crate::query_planner::QueryPlanResult;
Expand Down Expand Up @@ -107,6 +108,11 @@ where
query_analysis: &QueryAnalysisLayer,
cache_keys: Vec<WarmUpCachingQueryKey>,
) {
let _timer = Timer::new(|duration| {
::tracing::info!(
histogram.apollo.router.query.planning.warmup.duration = duration.as_secs_f64()
);
});
let schema_id = self.schema.schema_id.clone();

let mut service = ServiceBuilder::new().service(
Expand Down
5 changes: 5 additions & 0 deletions apollo-router/src/spec/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
use std::collections::HashMap;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Instant;

use apollo_compiler::diagnostics::ApolloDiagnostic;
use apollo_compiler::ApolloCompiler;
Expand Down Expand Up @@ -61,6 +62,7 @@ impl Schema {
}

pub(crate) fn parse(sdl: &str, configuration: &Configuration) -> Result<Self, SchemaError> {
let start = Instant::now();
let mut compiler = ApolloCompiler::new();
let id = compiler.add_type_system(sdl, "schema.graphql");

Expand Down Expand Up @@ -129,6 +131,9 @@ impl Schema {
let mut hasher = Sha256::new();
hasher.update(sdl.as_bytes());
let schema_id = Some(format!("{:x}", hasher.finalize()));
tracing::info!(
histogram.apollo.router.schema.load.duration = start.elapsed().as_secs_f64()
);

Ok(Schema {
raw_sdl: Arc::new(sdl.to_string()),
Expand Down
3 changes: 3 additions & 0 deletions docs/source/configuration/metrics.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ The coprocessor operations metric has the following attributes:

- `apollo_router_processing_time` - Time spent processing a request (outside of waiting for external or subgraph requests) in seconds.
- `apollo_router_query_planning_time` - Time spent planning queries in seconds.
- `apollo_router_query_planning_warmup.duration` - Time spent planning queries in seconds.
- `apollo_router_schema_load.duration` - Time spent loading the schema in seconds.
bnjjj marked this conversation as resolved.
Show resolved Hide resolved

#### Uplink

Expand All @@ -121,6 +123,7 @@ The coprocessor operations metric has the following attributes:
Note that the initial call to uplink during router startup will not be reflected in metrics.

#### Subscription

- `apollo_router_opened_subscriptions` - Number of different opened subscriptions (not the number of clients with an opened subscriptions in case it's deduplicated)
- `apollo_router_deduplicated_subscriptions_total` - Number of subscriptions that has been deduplicated
- `apollo_router_skipped_event_count` - Number of subscription events that has been skipped because too many events have been received from the subgraph but not yet sent to the client.
Expand Down