From bc65b991f15ff0015c980e765ab39ae6c2841460 Mon Sep 17 00:00:00 2001 From: bryncooke Date: Wed, 9 Jun 2021 18:46:22 +0200 Subject: [PATCH] Open telemetry support This PR adds instrumentation to emit the following spans: * federation.request - total time to server a request. * federation.validate - time to validate the federated query. * federation.plan - time to plan the federated query. * federation.execute - the total time executing query. * federation.fetch - time fetching data from a subgraph. * federation.postprocessing - time to render the response from all the subgraph data. The instrumentation looks a little messy due to the requirement to set error status on spans on exception, but also if errors are added to the response. --- .../package.json | 4 +- .../src/snapshotSerializers/index.ts | 1 + .../readableSpanArraySerializer.ts | 39 +++ gateway-js/CHANGELOG.md | 3 +- gateway-js/package.json | 3 +- .../__snapshots__/opentelemetry.test.ts.snap | 187 ++++++++++++ .../__tests__/gateway/opentelemetry.test.ts | 121 ++++++++ gateway-js/src/executeQueryPlan.ts | 280 ++++++++++-------- gateway-js/src/index.ts | 279 +++++++++-------- gateway-js/src/utilities/opentelemetry.ts | 8 + package-lock.json | 276 ++++++++++++++++- package.json | 1 + 12 files changed, 961 insertions(+), 241 deletions(-) create mode 100644 federation-integration-testsuite-js/src/snapshotSerializers/readableSpanArraySerializer.ts create mode 100644 gateway-js/src/__tests__/gateway/__snapshots__/opentelemetry.test.ts.snap create mode 100644 gateway-js/src/__tests__/gateway/opentelemetry.test.ts create mode 100644 gateway-js/src/utilities/opentelemetry.ts diff --git a/federation-integration-testsuite-js/package.json b/federation-integration-testsuite-js/package.json index 186a9b429..020481321 100644 --- a/federation-integration-testsuite-js/package.json +++ b/federation-integration-testsuite-js/package.json @@ -19,6 +19,8 @@ "homepage": "https://github.com/apollographql/federation#readme", "dependencies": { "apollo-graphql": "^0.9.3", - "graphql-tag": "^2.10.4" + "graphql-tag": "^2.10.4", + "@opentelemetry/api": "^0.20.0", + "@opentelemetry/tracing": "^0.20.0" } } diff --git a/federation-integration-testsuite-js/src/snapshotSerializers/index.ts b/federation-integration-testsuite-js/src/snapshotSerializers/index.ts index 770cb63e7..60d417c1c 100644 --- a/federation-integration-testsuite-js/src/snapshotSerializers/index.ts +++ b/federation-integration-testsuite-js/src/snapshotSerializers/index.ts @@ -3,4 +3,5 @@ export { queryPlanSerializer } from '@apollo/query-planner'; export { default as selectionSetSerializer } from './selectionSetSerializer'; export { default as typeSerializer } from './typeSerializer'; export { default as graphqlErrorSerializer } from './graphqlErrorSerializer'; +export { default as spanSerializer } from './readableSpanArraySerializer'; diff --git a/federation-integration-testsuite-js/src/snapshotSerializers/readableSpanArraySerializer.ts b/federation-integration-testsuite-js/src/snapshotSerializers/readableSpanArraySerializer.ts new file mode 100644 index 000000000..1ee08ceec --- /dev/null +++ b/federation-integration-testsuite-js/src/snapshotSerializers/readableSpanArraySerializer.ts @@ -0,0 +1,39 @@ +import { Plugin } from 'pretty-format'; +import { ReadableSpan } from '@opentelemetry/tracing' + +export default { + test(value: any) { + return value && Array.isArray(value) && value.length > 0 && isReadableSpan(value[0]); + }, + + print(spans: ReadableSpan[]) { + // This method takes an array of spans and builds up a set of trees containing only the information we are interested in. + // Spans contain the ID of it's parent, unless it is a root span with no parent. + // + // The simplified data model is called a span mirror. + // + // The algorithm is: + // 1. create span mirrors for every span and place it in a map. + // For each span: + // 1. find the corresponding mirrors for itself and parent. + // 2. push the parent in to the parent's children array. + + const root = {children:[]}; + const spanMirrors = new Map(); + spanMirrors.set(undefined, root); + spans.forEach((s) => spanMirrors.set(s, {name: s.name, attributes:s.attributes, children: [], status: s.status})); + spans.forEach((s) => { + const spanMirror = spanMirrors.get(s); + const parentSpan = spans.find((s2) => s2.spanContext().spanId === s.parentSpanId); + const parentSpanMirror = spanMirrors.get(parentSpan); + parentSpanMirror.children.push(spanMirror); + }); + + return JSON.stringify(root.children, undefined, 2); + }, +} as Plugin; + +function isReadableSpan(arg: any): arg is ReadableSpan { + let isSpan = arg && 'kind' in arg && 'startTime' in arg && 'parentSpanId' in arg; + return isSpan; +} diff --git a/gateway-js/CHANGELOG.md b/gateway-js/CHANGELOG.md index 117ea9e93..8269846ae 100644 --- a/gateway-js/CHANGELOG.md +++ b/gateway-js/CHANGELOG.md @@ -4,7 +4,8 @@ > The changes noted within this `vNEXT` section have not been released yet. New PRs and commits which introduce changes should include an entry in this `vNEXT` section as part of their development. When a release is being prepared, a new header will be (manually) created below and the appropriate changes within that release will be moved into the new section. -- Take subtypes into account when matching type conditions to extract representations. [PR #804](https://github.com/apollographql/federation/pull/804) +- Take subtypes into account when matching type conditions to extract representations. [PR #804](https://github.com/apollographql/federation/pull/804) +- OpenTelemetry support. [803](https://github.com/apollographql/federation/pull/803) ## v0.28.3 diff --git a/gateway-js/package.json b/gateway-js/package.json index 48b6f414a..b7ca2a8de 100644 --- a/gateway-js/package.json +++ b/gateway-js/package.json @@ -37,7 +37,8 @@ "apollo-server-types": "^0.9.0", "loglevel": "^1.6.1", "make-fetch-happen": "^8.0.0", - "pretty-format": "^26.0.0" + "pretty-format": "^26.0.0", + "@opentelemetry/api": "^0.20.0" }, "peerDependencies": { "graphql": "^14.5.0 || ^15.0.0" diff --git a/gateway-js/src/__tests__/gateway/__snapshots__/opentelemetry.test.ts.snap b/gateway-js/src/__tests__/gateway/__snapshots__/opentelemetry.test.ts.snap new file mode 100644 index 000000000..94a25c0b5 --- /dev/null +++ b/gateway-js/src/__tests__/gateway/__snapshots__/opentelemetry.test.ts.snap @@ -0,0 +1,187 @@ +// Jest Snapshot v1, https://goo.gl/fbAQLP + +exports[`opentelemetry receives spans on fetch failure 1`] = ` +[ + { + "name": "gateway.request", + "attributes": {}, + "children": [ + { + "name": "gateway.validate", + "attributes": {}, + "children": [], + "status": { + "code": 0 + } + }, + { + "name": "gateway.plan", + "attributes": {}, + "children": [], + "status": { + "code": 0 + } + }, + { + "name": "gateway.execute", + "attributes": {}, + "children": [ + { + "name": "gateway.fetch", + "attributes": { + "service": "product" + }, + "children": [], + "status": { + "code": 2 + } + }, + { + "name": "gateway.postprocessing", + "attributes": {}, + "children": [], + "status": { + "code": 0 + } + } + ], + "status": { + "code": 2 + } + } + ], + "status": { + "code": 2 + } + } +] +`; + +exports[`opentelemetry with local data receives spans on plan failure 1`] = ` +[ + { + "name": "gateway.request", + "attributes": {}, + "children": [ + { + "name": "gateway.validate", + "attributes": {}, + "children": [], + "status": { + "code": 0 + } + }, + { + "name": "gateway.plan", + "attributes": {}, + "children": [], + "status": { + "code": 2 + } + } + ], + "status": { + "code": 2 + } + } +] +`; + +exports[`opentelemetry with local data receives spans on success 1`] = ` +[ + { + "name": "gateway.request", + "attributes": {}, + "children": [ + { + "name": "gateway.validate", + "attributes": {}, + "children": [], + "status": { + "code": 0 + } + }, + { + "name": "gateway.plan", + "attributes": {}, + "children": [], + "status": { + "code": 0 + } + }, + { + "name": "gateway.execute", + "attributes": {}, + "children": [ + { + "name": "gateway.fetch", + "attributes": { + "service": "product" + }, + "children": [], + "status": { + "code": 0 + } + }, + { + "name": "gateway.fetch", + "attributes": { + "service": "books" + }, + "children": [], + "status": { + "code": 0 + } + }, + { + "name": "gateway.fetch", + "attributes": { + "service": "product" + }, + "children": [], + "status": { + "code": 0 + } + }, + { + "name": "gateway.postprocessing", + "attributes": {}, + "children": [], + "status": { + "code": 0 + } + } + ], + "status": { + "code": 0 + } + } + ], + "status": { + "code": 0 + } + } +] +`; + +exports[`opentelemetry with local data receives spans on validation failure 1`] = ` +[ + { + "name": "gateway.request", + "attributes": {}, + "children": [ + { + "name": "gateway.validate", + "attributes": {}, + "children": [], + "status": { + "code": 2 + } + } + ], + "status": { + "code": 2 + } + } +] +`; diff --git a/gateway-js/src/__tests__/gateway/opentelemetry.test.ts b/gateway-js/src/__tests__/gateway/opentelemetry.test.ts new file mode 100644 index 000000000..7783a0fed --- /dev/null +++ b/gateway-js/src/__tests__/gateway/opentelemetry.test.ts @@ -0,0 +1,121 @@ +import gql from 'graphql-tag'; +import {ApolloGateway, LocalGraphQLDataSource} from '../../'; +import {fixtures, spanSerializer} from 'apollo-federation-integration-testsuite'; +import {fetch} from '../../__mocks__/apollo-server-env'; +import {InMemorySpanExporter, SimpleSpanProcessor} from '@opentelemetry/tracing' +import {NodeTracerProvider} from '@opentelemetry/node'; +import { buildFederatedSchema } from '@apollo/federation'; + +expect.addSnapshotSerializer(spanSerializer); + +const inMemorySpans = new InMemorySpanExporter(); +const tracerProvider = new NodeTracerProvider(); +tracerProvider.addSpanProcessor(new SimpleSpanProcessor(inMemorySpans)); +tracerProvider.register(); + +beforeEach(() => { + fetch.mockReset(); + inMemorySpans.reset(); +}); + +describe('opentelemetry', () => { + async function execute(executor: any, source: string, variables: any) { + await executor({ + source, + document: gql(source), + request: { + variables: variables, + }, + queryHash: 'hashed', + context: null, + cache: {} as any, + }); + } + + describe('with local data', () => + { + async function gateway() { + const gateway = new ApolloGateway({ + localServiceList: fixtures, + buildService: service => { + // @ts-ignore + return new LocalGraphQLDataSource(buildFederatedSchema([service])); + }, + }); + + const {executor} = await gateway.load(); + return executor; + } + + it('receives spans on success', async () => { + const executor = await gateway(); + + const source = `#graphql + query GetProduct($upc: String!) { + product(upc: $upc) { + name + } + } + `; + + await execute(executor, source, {upc: '1'}); + expect(inMemorySpans.getFinishedSpans()).toMatchSnapshot(); + }); + + it('receives spans on validation failure', async () => { + const executor = await gateway(); + const source = `#graphql + query InvalidVariables($first: Int!) { + topReviews(first: $first) { + body + } + } + `; + + await execute(executor, source, {upc: '1'}); + expect(inMemorySpans.getFinishedSpans()).toMatchSnapshot(); + }); + + it('receives spans on plan failure', async () => { + const executor = await gateway(); + const source = `#graphql + subscription GetProduct($upc: String!) { + product(upc: $upc) { + name + } + } + `; + + try { + await execute(executor, source, {upc: '1'}); + } + catch(err) {} + expect(inMemorySpans.getFinishedSpans()).toMatchSnapshot(); + }); + }); + + + it('receives spans on fetch failure', async () => { + + fetch.mockImplementationOnce(async () => { + throw Error("Nooo"); + }); + + const gateway = new ApolloGateway({ + localServiceList: fixtures, + }); + + const { executor } = await gateway.load(); + + const source = `#graphql + query GetProduct($upc: String!) { + product(upc: $upc) { + name + } + } + `; + + await execute(executor, source, {upc: '1'}); + expect(inMemorySpans.getFinishedSpans()).toMatchSnapshot(); + }); +}); diff --git a/gateway-js/src/executeQueryPlan.ts b/gateway-js/src/executeQueryPlan.ts index 6d7cd0866..0f90a4397 100644 --- a/gateway-js/src/executeQueryPlan.ts +++ b/gateway-js/src/executeQueryPlan.ts @@ -28,6 +28,10 @@ import { } from '@apollo/query-planner'; import { deepMerge } from './utilities/deepMerge'; import { isNotNullOrUndefined } from './utilities/array'; +import { default as opentelemetry, SpanStatusCode } from "@opentelemetry/api"; +import {OpenTelemetrySpanNames} from "./utilities/opentelemetry"; + +const tracer = opentelemetry.trace.getTracer('default'); export type ServiceMap = { [serviceName: string]: GraphQLDataSource; @@ -49,63 +53,88 @@ export async function executeQueryPlan( requestContext: GraphQLRequestContext, operationContext: OperationContext, ): Promise { - const errors: GraphQLError[] = []; - - const context: ExecutionContext = { - queryPlan, - operationContext, - serviceMap, - requestContext, - errors, - }; + return tracer.startActiveSpan(OpenTelemetrySpanNames.EXECUTE, async span => { + try { + const errors: GraphQLError[] = []; + + const context: ExecutionContext = { + queryPlan, + operationContext, + serviceMap, + requestContext, + errors, + }; - let data: ResultMap | undefined | null = Object.create(null); + let data: ResultMap | undefined | null = Object.create(null); - const captureTraces = !!( - requestContext.metrics && requestContext.metrics.captureTraces - ); + const captureTraces = !!( + requestContext.metrics && requestContext.metrics.captureTraces + ); - if (queryPlan.node) { - const traceNode = await executeNode( - context, - queryPlan.node, - data!, - [], - captureTraces, - ); - if (captureTraces) { - requestContext.metrics!.queryPlanTrace = traceNode; - } - } + if (queryPlan.node) { + const traceNode = await executeNode( + context, + queryPlan.node, + data!, + [], + captureTraces, + ); + if (captureTraces) { + requestContext.metrics!.queryPlanTrace = traceNode; + } + } - // FIXME: Re-executing the query is a pretty heavy handed way of making sure - // only explicitly requested fields are included and field ordering follows - // the original query. - // It is also used to allow execution of introspection queries though. - try { - const executionResult = await execute({ - schema: operationContext.schema, - document: { - kind: Kind.DOCUMENT, - definitions: [ - operationContext.operation, - ...Object.values(operationContext.fragments), - ], - }, - rootValue: data, - variableValues: requestContext.request.variables, - // See also `wrapSchemaWithAliasResolver` in `gateway-js/src/index.ts`. - fieldResolver: defaultFieldResolverWithAliasSupport, - }); - data = executionResult.data; - if (executionResult.errors?.length) { - errors.push(...executionResult.errors) - } - } catch (error) { - return { errors: [error] }; - } + let result = await tracer.startActiveSpan(OpenTelemetrySpanNames.POST_PROCESSING, async (span) => { - return errors.length === 0 ? { data } : { errors, data }; + // FIXME: Re-executing the query is a pretty heavy handed way of making sure + // only explicitly requested fields are included and field ordering follows + // the original query. + // It is also used to allow execution of introspection queries though. + try { + const executionResult = await execute({ + schema: operationContext.schema, + document: { + kind: Kind.DOCUMENT, + definitions: [ + operationContext.operation, + ...Object.values(operationContext.fragments), + ], + }, + rootValue: data, + variableValues: requestContext.request.variables, + // See also `wrapSchemaWithAliasResolver` in `gateway-js/src/index.ts`. + fieldResolver: defaultFieldResolverWithAliasSupport, + }); + data = executionResult.data; + if (executionResult.errors?.length) { + errors.push(...executionResult.errors) + } + } catch (error) { + span.setStatus({ code:SpanStatusCode.ERROR }); + return { errors: [error] }; + } + finally { + span.end() + } + if(errors.length > 0) { + span.setStatus({ code:SpanStatusCode.ERROR }); + } + return errors.length === 0 ? { data } : { errors, data }; + }); + + if(result.errors) { + span.setStatus({ code:SpanStatusCode.ERROR }); + } + return result; + } + catch (err) { + span.setStatus({ code:SpanStatusCode.ERROR }); + throw err; + } + finally { + span.end(); + } + }); } // Note: this function always returns a protobuf QueryPlanNode tree, even if @@ -203,50 +232,54 @@ async function executeFetch( _path: ResponsePath, traceNode: Trace.QueryPlanNode.FetchNode | null, ): Promise { + const logger = context.requestContext.logger || console; const service = context.serviceMap[fetch.serviceName]; - if (!service) { - throw new Error(`Couldn't find service with name "${fetch.serviceName}"`); - } - let entities: ResultMap[]; - if (Array.isArray(results)) { - // Remove null or undefined entities from the list - entities = results.filter(isNotNullOrUndefined); - } else { - entities = [results]; - } + return tracer.startActiveSpan(OpenTelemetrySpanNames.FETCH, {attributes:{service:fetch.serviceName}}, async span => { + try { + if (!service) { + throw new Error(`Couldn't find service with name "${fetch.serviceName}"`); + } - if (entities.length < 1) return; + let entities: ResultMap[]; + if (Array.isArray(results)) { + // Remove null or undefined entities from the list + entities = results.filter(isNotNullOrUndefined); + } else { + entities = [results]; + } - let variables = Object.create(null); - if (fetch.variableUsages) { - for (const variableName of fetch.variableUsages) { - const providedVariables = context.requestContext.request.variables; - if ( - providedVariables && - typeof providedVariables[variableName] !== 'undefined' - ) { - variables[variableName] = providedVariables[variableName]; + if (entities.length < 1) return; + + let variables = Object.create(null); + if (fetch.variableUsages) { + for (const variableName of fetch.variableUsages) { + const providedVariables = context.requestContext.request.variables; + if ( + providedVariables && + typeof providedVariables[variableName] !== 'undefined' + ) { + variables[variableName] = providedVariables[variableName]; + } + } } - } - } - if (!fetch.requires) { - const dataReceivedFromService = await sendOperation( - context, - fetch.operation, - variables, - ); + if (!fetch.requires) { + const dataReceivedFromService = await sendOperation( + context, + fetch.operation, + variables, + ); - for (const entity of entities) { - deepMerge(entity, dataReceivedFromService); - } - } else { - const requires = fetch.requires; + for (const entity of entities) { + deepMerge(entity, dataReceivedFromService); + } + } else { + const requires = fetch.requires; - const representations: ResultMap[] = []; - const representationToEntity: number[] = []; + const representations: ResultMap[] = []; + const representationToEntity: number[] = []; entities.forEach((entity, index) => { const representation = executeSelectionSet( @@ -260,46 +293,55 @@ async function executeFetch( } }); - // If there are no representations, that means the type conditions in - // the requires don't match any entities. - if (representations.length < 1) return; + // If there are no representations, that means the type conditions in + // the requires don't match any entities. + if (representations.length < 1) return; - if ('representations' in variables) { - throw new Error(`Variables cannot contain key "representations"`); - } + if ('representations' in variables) { + throw new Error(`Variables cannot contain key "representations"`); + } + + const dataReceivedFromService = await sendOperation( + context, + fetch.operation, + {...variables, representations}, + ); - const dataReceivedFromService = await sendOperation( - context, - fetch.operation, - { ...variables, representations }, - ); + if (!dataReceivedFromService) { + return; + } - if (!dataReceivedFromService) { - return; - } + if ( + !( + dataReceivedFromService._entities && + Array.isArray(dataReceivedFromService._entities) + ) + ) { + throw new Error(`Expected "data._entities" in response to be an array`); + } - if ( - !( - dataReceivedFromService._entities && - Array.isArray(dataReceivedFromService._entities) - ) - ) { - throw new Error(`Expected "data._entities" in response to be an array`); - } + const receivedEntities = dataReceivedFromService._entities; - const receivedEntities = dataReceivedFromService._entities; + if (receivedEntities.length !== representations.length) { + throw new Error( + `Expected "data._entities" to contain ${representations.length} elements`, + ); + } - if (receivedEntities.length !== representations.length) { - throw new Error( - `Expected "data._entities" to contain ${representations.length} elements`, - ); + for (let i = 0; i < entities.length; i++) { + deepMerge(entities[representationToEntity[i]], receivedEntities[i]); + } + } } - - for (let i = 0; i < entities.length; i++) { - deepMerge(entities[representationToEntity[i]], receivedEntities[i]); + catch (err) { + span.setStatus({ code:SpanStatusCode.ERROR }); + throw err; } - } - + finally + { + span.end(); + } + }); async function sendOperation( context: ExecutionContext, source: string, diff --git a/gateway-js/src/index.ts b/gateway-js/src/index.ts index e450b164c..dfb4fb4d9 100644 --- a/gateway-js/src/index.ts +++ b/gateway-js/src/index.ts @@ -70,6 +70,10 @@ import { import { loadSupergraphSdlFromStorage } from './loadSupergraphSdlFromStorage'; import { getServiceDefinitionsFromStorage } from './legacyLoadServicesFromStorage'; import { buildComposedSchema } from '@apollo/query-planner'; +import { default as opentelemetry, SpanStatusCode } from "@opentelemetry/api"; +import {OpenTelemetrySpanNames} from "./utilities/opentelemetry"; + +const tracer = opentelemetry.trace.getTracer('gateway'); type DataSourceMap = { [serviceName: string]: { url?: string; dataSource: GraphQLDataSource }; @@ -928,142 +932,185 @@ export class ApolloGateway implements GraphQLService { public executor = async ( requestContext: GraphQLRequestContextExecutionDidStart, ): Promise => { - const { request, document, queryHash } = requestContext; - const queryPlanStoreKey = queryHash + (request.operationName || ''); - const operationContext = buildOperationContext({ - schema: this.schema!, - operationDocument: document, - operationName: request.operationName, - }); + return tracer.startActiveSpan(OpenTelemetrySpanNames.REQUEST, async span => { + try { + const {request, document, queryHash} = requestContext; + const queryPlanStoreKey = queryHash + (request.operationName || ''); + const operationContext = buildOperationContext({ + schema: this.schema!, + operationDocument: document, + operationName: request.operationName, + }); - // No need to build a query plan if we know the request is invalid beforehand - // In the future, this should be controlled by the requestPipeline - const validationErrors = this.validateIncomingRequest( - requestContext, - operationContext, - ); - if (validationErrors.length > 0) { - return { errors: validationErrors }; - } + // No need to build a query plan if we know the request is invalid beforehand + // In the future, this should be controlled by the requestPipeline + const validationErrors = this.validateIncomingRequest( + requestContext, + operationContext, + ); - let queryPlan: QueryPlan | undefined; - if (this.queryPlanStore) { - queryPlan = await this.queryPlanStore.get(queryPlanStoreKey); - } + if (validationErrors.length > 0) { + span.setStatus({ code:SpanStatusCode.ERROR }); + return {errors: validationErrors}; + } - if (!queryPlan) { - // TODO(#631): Can we be sure the query planner has been initialized here? - queryPlan = this.queryPlanner!.buildQueryPlan(operationContext, { - autoFragmentization: Boolean( - this.config.experimental_autoFragmentization, - ), - }); - if (this.queryPlanStore) { - // The underlying cache store behind the `documentStore` returns a - // `Promise` which is resolved (or rejected), eventually, based on the - // success or failure (respectively) of the cache save attempt. While - // it's certainly possible to `await` this `Promise`, we don't care about - // whether or not it's successful at this point. We'll instead proceed - // to serve the rest of the request and just hope that this works out. - // If it doesn't work, the next request will have another opportunity to - // try again. Errors will surface as warnings, as appropriate. - // - // While it shouldn't normally be necessary to wrap this `Promise` in a - // `Promise.resolve` invocation, it seems that the underlying cache store - // is returning a non-native `Promise` (e.g. Bluebird, etc.). - Promise.resolve( - this.queryPlanStore.set(queryPlanStoreKey, queryPlan), - ).catch((err) => - this.logger.warn( - 'Could not store queryPlan' + ((err && err.message) || err), - ), - ); - } - } + let queryPlan: QueryPlan | undefined; + if (this.queryPlanStore) { + queryPlan = await this.queryPlanStore.get(queryPlanStoreKey); + } - const serviceMap: ServiceMap = Object.entries(this.serviceMap).reduce( - (serviceDataSources, [serviceName, { dataSource }]) => { - serviceDataSources[serviceName] = dataSource; - return serviceDataSources; - }, - Object.create(null) as ServiceMap, - ); + if (!queryPlan) { + queryPlan = tracer.startActiveSpan(OpenTelemetrySpanNames.PLAN, span => { + try { + // TODO(#631): Can we be sure the query planner has been initialized here? + return this.queryPlanner!.buildQueryPlan(operationContext, { + autoFragmentization: Boolean( + this.config.experimental_autoFragmentization, + ), + }); + } + catch (err) { + span.setStatus({ code:SpanStatusCode.ERROR }); + throw err; + } + finally { + span.end(); + } + }); + + if (this.queryPlanStore) { + // The underlying cache store behind the `documentStore` returns a + // `Promise` which is resolved (or rejected), eventually, based on the + // success or failure (respectively) of the cache save attempt. While + // it's certainly possible to `await` this `Promise`, we don't care about + // whether or not it's successful at this point. We'll instead proceed + // to serve the rest of the request and just hope that this works out. + // If it doesn't work, the next request will have another opportunity to + // try again. Errors will surface as warnings, as appropriate. + // + // While it shouldn't normally be necessary to wrap this `Promise` in a + // `Promise.resolve` invocation, it seems that the underlying cache store + // is returning a non-native `Promise` (e.g. Bluebird, etc.). + Promise.resolve( + this.queryPlanStore.set(queryPlanStoreKey, queryPlan), + ).catch((err) => + this.logger.warn( + 'Could not store queryPlan' + ((err && err.message) || err), + ), + ); + } + } - if (this.experimental_didResolveQueryPlan) { - this.experimental_didResolveQueryPlan({ - queryPlan, - serviceMap, - requestContext, - operationContext, - }); - } + const serviceMap: ServiceMap = Object.entries(this.serviceMap).reduce( + (serviceDataSources, [serviceName, {dataSource}]) => { + serviceDataSources[serviceName] = dataSource; + return serviceDataSources; + }, + Object.create(null) as ServiceMap, + ); - const response = await executeQueryPlan( - queryPlan, - serviceMap, - requestContext, - operationContext, - ); + if (this.experimental_didResolveQueryPlan) { + this.experimental_didResolveQueryPlan({ + queryPlan, + serviceMap, + requestContext, + operationContext, + }); + } - const shouldShowQueryPlan = - this.config.__exposeQueryPlanExperimental && - request.http && - request.http.headers && - request.http.headers.get('Apollo-Query-Plan-Experimental'); - - // We only want to serialize the query plan if we're going to use it, which is - // in two cases: - // 1) non-empty query plan and config.debug === true - // 2) non-empty query plan and shouldShowQueryPlan === true - const serializedQueryPlan = - queryPlan.node && (this.config.debug || shouldShowQueryPlan) - ? // FIXME: I disabled printing the query plan because this lead to a - // circular dependency between the `@apollo/gateway` and - // `apollo-federation-integration-testsuite` packages. - // We should either solve that or switch Playground to - // the JSON serialization format. - prettyFormatQueryPlan(queryPlan) - : null; - - if (this.config.debug && serializedQueryPlan) { - this.logger.debug(serializedQueryPlan); - } + const response = await executeQueryPlan( + queryPlan, + serviceMap, + requestContext, + operationContext, + ); - if (shouldShowQueryPlan) { - // TODO: expose the query plan in a more flexible JSON format in the future - // and rename this to `queryPlan`. Playground should cutover to use the new - // option once we've built a way to print that representation. - - // In the case that `serializedQueryPlan` is null (on introspection), we - // still want to respond to Playground with something truthy since it depends - // on this to decide that query plans are supported by this gateway. - response.extensions = { - __queryPlanExperimental: serializedQueryPlan || true, - }; - } - return response; + const shouldShowQueryPlan = + this.config.__exposeQueryPlanExperimental && + request.http && + request.http.headers && + request.http.headers.get('Apollo-Query-Plan-Experimental'); + + // We only want to serialize the query plan if we're going to use it, which is + // in two cases: + // 1) non-empty query plan and config.debug === true + // 2) non-empty query plan and shouldShowQueryPlan === true + const serializedQueryPlan = + queryPlan.node && (this.config.debug || shouldShowQueryPlan) + ? // FIXME: I disabled printing the query plan because this lead to a + // circular dependency between the `@apollo/gateway` and + // `apollo-federation-integration-testsuite` packages. + // We should either solve that or switch Playground to + // the JSON serialization format. + prettyFormatQueryPlan(queryPlan) + : null; + + if (this.config.debug && serializedQueryPlan) { + this.logger.debug(serializedQueryPlan); + } + + if (shouldShowQueryPlan) { + // TODO: expose the query plan in a more flexible JSON format in the future + // and rename this to `queryPlan`. Playground should cutover to use the new + // option once we've built a way to print that representation. + + // In the case that `serializedQueryPlan` is null (on introspection), we + // still want to respond to Playground with something truthy since it depends + // on this to decide that query plans are supported by this gateway. + response.extensions = { + __queryPlanExperimental: serializedQueryPlan || true, + }; + } + if(response.errors) { + span.setStatus({ code:SpanStatusCode.ERROR }); + } + return response; + } + catch (err) { + span.setStatus({ code:SpanStatusCode.ERROR }); + throw err; + } + finally { + span.end(); + } + }); }; private validateIncomingRequest( requestContext: GraphQLRequestContextExecutionDidStart, operationContext: OperationContext, ) { - // casting out of `readonly` - const variableDefinitions = operationContext.operation - .variableDefinitions as VariableDefinitionNode[] | undefined; + return tracer.startActiveSpan(OpenTelemetrySpanNames.VALIDATE, span => { + try { + // casting out of `readonly` + const variableDefinitions = operationContext.operation + .variableDefinitions as VariableDefinitionNode[] | undefined; - if (!variableDefinitions) return []; + if (!variableDefinitions) return []; - const { errors } = getVariableValues( - operationContext.schema, - variableDefinitions, - requestContext.request.variables || {}, - ); + const {errors} = getVariableValues( + operationContext.schema, + variableDefinitions, + requestContext.request.variables || {}, + ); - return errors || []; + if(errors) { + span.setStatus({ code:SpanStatusCode.ERROR }); + } + return errors || []; + } + catch (err) { + span.setStatus({ code:SpanStatusCode.ERROR }); + throw err; + } + finally { + span.end(); + } + }); } + // Stops all processes involved with the gateway (for now, just background // schema polling). Can be called multiple times safely. Once it (async) // returns, all gateway background activity will be finished. diff --git a/gateway-js/src/utilities/opentelemetry.ts b/gateway-js/src/utilities/opentelemetry.ts new file mode 100644 index 000000000..0ed406760 --- /dev/null +++ b/gateway-js/src/utilities/opentelemetry.ts @@ -0,0 +1,8 @@ +export enum OpenTelemetrySpanNames { + REQUEST = "gateway.request", + PLAN = "gateway.plan", + FETCH = "gateway.fetch", + POST_PROCESSING = "gateway.postprocessing", + EXECUTE = "gateway.execute", + VALIDATE = "gateway.validate", +} diff --git a/package-lock.json b/package-lock.json index 4826f9f14..91d88fbde 100644 --- a/package-lock.json +++ b/package-lock.json @@ -19,6 +19,7 @@ "@graphql-codegen/cli": "1.21.5", "@graphql-codegen/typescript": "1.22.1", "@graphql-codegen/typescript-operations": "1.18.0", + "@opentelemetry/node": "^0.20.0", "@rollup/plugin-commonjs": "19.0.0", "@rollup/plugin-node-resolve": "13.0.0", "@types/bunyan": "1.8.6", @@ -66,6 +67,8 @@ "version": "0.25.1", "license": "MIT", "dependencies": { + "@opentelemetry/api": "^0.20.0", + "@opentelemetry/tracing": "^0.20.0", "apollo-graphql": "^0.9.3", "graphql-tag": "^2.10.4" } @@ -87,11 +90,12 @@ }, "gateway-js": { "name": "@apollo/gateway", - "version": "0.28.2", + "version": "0.28.3", "license": "MIT", "dependencies": { "@apollo/federation": "file:../federation-js", "@apollo/query-planner": "file:../query-planner-js", + "@opentelemetry/api": "^0.20.0", "@types/node-fetch": "2.5.10", "apollo-graphql": "^0.9.3", "apollo-reporting-protobuf": "^0.8.0", @@ -137,7 +141,7 @@ }, "harmonizer": { "name": "@apollo/harmonizer", - "version": "0.2.5", + "version": "0.3.2", "license": "MIT", "dependencies": { "@apollo/federation": "file:../federation-js" @@ -5687,6 +5691,160 @@ "@octokit/openapi-types": "^7.0.0" } }, + "node_modules/@opentelemetry/api": { + "version": "0.20.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-0.20.0.tgz", + "integrity": "sha512-n06MtDYEc2H07S/NTvGMlxF2Ijp0YbNrI/rBgLcxpEh3hxOkPZA12gxlUoZkBHWCZYau2j3b/uL+QFpiQKOjSw==", + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/@opentelemetry/context-async-hooks": { + "version": "0.20.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/context-async-hooks/-/context-async-hooks-0.20.0.tgz", + "integrity": "sha512-4cuTIPpufWRDdShtvT0c30/jHfO9eXzUh6tU087J8aO8J/hckyCIlN03eB7pfqPwQzLnWONGdHOpGjLSY7q4tg==", + "dev": true, + "engines": { + "node": ">=8.1.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^0.20.0" + } + }, + "node_modules/@opentelemetry/core": { + "version": "0.20.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/core/-/core-0.20.0.tgz", + "integrity": "sha512-09zQqB4vp2jcyBnglA/TFklDQoVgWrFKtr9pDm0q3Oa1bD2Hwpq+JapBAw18YdMQsLNQM/qsXhFlS3gFDVEy4A==", + "dependencies": { + "@opentelemetry/semantic-conventions": "0.20.0", + "semver": "^7.1.3" + }, + "engines": { + "node": ">=8.5.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^0.20.0" + } + }, + "node_modules/@opentelemetry/core/node_modules/semver": { + "version": "7.3.5", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.5.tgz", + "integrity": "sha512-PoeGJYh8HK4BTO/a9Tf6ZG3veo/A7ZVsYrSA6J8ny9nb3B1VrpkuN+z9OE5wfE5p6H4LchYZsegiQgbJD94ZFQ==", + "dependencies": { + "lru-cache": "^6.0.0" + }, + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/@opentelemetry/node": { + "version": "0.20.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/node/-/node-0.20.0.tgz", + "integrity": "sha512-MVwnH/AoHQTz1jOhJTYXoAoQD4CA/3L7QQkiiA93f6QGaWKIHjI/+3fUtA/GCfPR9Kf0sItQ/aag8KtCJClPCw==", + "dev": true, + "dependencies": { + "@opentelemetry/context-async-hooks": "0.20.0", + "@opentelemetry/core": "0.20.0", + "@opentelemetry/propagator-b3": "0.20.0", + "@opentelemetry/propagator-jaeger": "0.20.0", + "@opentelemetry/tracing": "0.20.0", + "semver": "^7.1.3" + }, + "engines": { + "node": ">=8.0.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^0.20.0" + } + }, + "node_modules/@opentelemetry/node/node_modules/semver": { + "version": "7.3.5", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.5.tgz", + "integrity": "sha512-PoeGJYh8HK4BTO/a9Tf6ZG3veo/A7ZVsYrSA6J8ny9nb3B1VrpkuN+z9OE5wfE5p6H4LchYZsegiQgbJD94ZFQ==", + "dev": true, + "dependencies": { + "lru-cache": "^6.0.0" + }, + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/@opentelemetry/propagator-b3": { + "version": "0.20.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/propagator-b3/-/propagator-b3-0.20.0.tgz", + "integrity": "sha512-TGI2D45oUVlbXVEWDedqxwO0WUtzchN/tuYghEHjRTNcVLLKT2ci9JwzHormC+ls98SYPDfvuzpB0+ParoexPQ==", + "dev": true, + "dependencies": { + "@opentelemetry/core": "0.20.0" + }, + "engines": { + "node": ">=8.0.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^0.20.0" + } + }, + "node_modules/@opentelemetry/propagator-jaeger": { + "version": "0.20.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/propagator-jaeger/-/propagator-jaeger-0.20.0.tgz", + "integrity": "sha512-6kdnd1ePADx4XDaFw4Ea47fdIZohhJbd30Fc4yvl0DO+RR5WEAiAho0IsoTj6L/qvOb/+LAfvjvdk2UOXgb/3Q==", + "dev": true, + "dependencies": { + "@opentelemetry/core": "0.20.0" + }, + "engines": { + "node": ">=8.5.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^0.20.0" + } + }, + "node_modules/@opentelemetry/resources": { + "version": "0.20.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-0.20.0.tgz", + "integrity": "sha512-nnd3vIM+A9ih6kOVBc2CF5NkTYmdNci5aQ+A5lQjf3HqjEptcGubpg1J1Q84LSFVoITvuH2O6+GhPBrdxYBt8g==", + "dependencies": { + "@opentelemetry/core": "0.20.0", + "@opentelemetry/semantic-conventions": "0.20.0" + }, + "engines": { + "node": ">=8.0.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^0.20.0" + } + }, + "node_modules/@opentelemetry/semantic-conventions": { + "version": "0.20.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/semantic-conventions/-/semantic-conventions-0.20.0.tgz", + "integrity": "sha512-x40C3vQMttFlnNEfhFwO49jHrY6AoWnntL35TCem3LINr/aw1W0hGhdKY/zweC64CBJEyiHumaae480rqF8eOA==", + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/@opentelemetry/tracing": { + "version": "0.20.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/tracing/-/tracing-0.20.0.tgz", + "integrity": "sha512-8ZIH0IBxIucgza0BFNiCCLByUsvu45Dm5k292RlO/E8Z1q/J7otJmh9r/EkaFb0ZSyjNdawmJ1CXnlU7+IQN1w==", + "dependencies": { + "@opentelemetry/core": "0.20.0", + "@opentelemetry/resources": "0.20.0", + "@opentelemetry/semantic-conventions": "0.20.0", + "lodash.merge": "^4.6.2" + }, + "engines": { + "node": ">=8.0.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^0.20.0" + } + }, "node_modules/@protobufjs/aspromise": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz", @@ -13844,6 +14002,11 @@ "integrity": "sha1-1SfftUVuynzJu5XV2ur4i6VKVFE=", "dev": true }, + "node_modules/lodash.merge": { + "version": "4.6.2", + "resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz", + "integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==" + }, "node_modules/lodash.once": { "version": "4.1.1", "resolved": "https://registry.npmjs.org/lodash.once/-/lodash.once-4.1.1.tgz", @@ -19925,7 +20088,7 @@ }, "query-planner-js": { "name": "@apollo/query-planner", - "version": "0.2.0", + "version": "0.2.1", "license": "MIT", "dependencies": { "chalk": "^4.1.0", @@ -19953,6 +20116,7 @@ "requires": { "@apollo/federation": "file:../federation-js", "@apollo/query-planner": "file:../query-planner-js", + "@opentelemetry/api": "^0.20.0", "@types/node-fetch": "2.5.10", "apollo-graphql": "^0.9.3", "apollo-reporting-protobuf": "^0.8.0", @@ -24911,6 +25075,105 @@ "@octokit/openapi-types": "^7.0.0" } }, + "@opentelemetry/api": { + "version": "0.20.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-0.20.0.tgz", + "integrity": "sha512-n06MtDYEc2H07S/NTvGMlxF2Ijp0YbNrI/rBgLcxpEh3hxOkPZA12gxlUoZkBHWCZYau2j3b/uL+QFpiQKOjSw==" + }, + "@opentelemetry/context-async-hooks": { + "version": "0.20.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/context-async-hooks/-/context-async-hooks-0.20.0.tgz", + "integrity": "sha512-4cuTIPpufWRDdShtvT0c30/jHfO9eXzUh6tU087J8aO8J/hckyCIlN03eB7pfqPwQzLnWONGdHOpGjLSY7q4tg==", + "dev": true, + "requires": {} + }, + "@opentelemetry/core": { + "version": "0.20.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/core/-/core-0.20.0.tgz", + "integrity": "sha512-09zQqB4vp2jcyBnglA/TFklDQoVgWrFKtr9pDm0q3Oa1bD2Hwpq+JapBAw18YdMQsLNQM/qsXhFlS3gFDVEy4A==", + "requires": { + "@opentelemetry/semantic-conventions": "0.20.0", + "semver": "^7.1.3" + }, + "dependencies": { + "semver": { + "version": "7.3.5", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.5.tgz", + "integrity": "sha512-PoeGJYh8HK4BTO/a9Tf6ZG3veo/A7ZVsYrSA6J8ny9nb3B1VrpkuN+z9OE5wfE5p6H4LchYZsegiQgbJD94ZFQ==", + "requires": { + "lru-cache": "^6.0.0" + } + } + } + }, + "@opentelemetry/node": { + "version": "0.20.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/node/-/node-0.20.0.tgz", + "integrity": "sha512-MVwnH/AoHQTz1jOhJTYXoAoQD4CA/3L7QQkiiA93f6QGaWKIHjI/+3fUtA/GCfPR9Kf0sItQ/aag8KtCJClPCw==", + "dev": true, + "requires": { + "@opentelemetry/context-async-hooks": "0.20.0", + "@opentelemetry/core": "0.20.0", + "@opentelemetry/propagator-b3": "0.20.0", + "@opentelemetry/propagator-jaeger": "0.20.0", + "@opentelemetry/tracing": "0.20.0", + "semver": "^7.1.3" + }, + "dependencies": { + "semver": { + "version": "7.3.5", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.5.tgz", + "integrity": "sha512-PoeGJYh8HK4BTO/a9Tf6ZG3veo/A7ZVsYrSA6J8ny9nb3B1VrpkuN+z9OE5wfE5p6H4LchYZsegiQgbJD94ZFQ==", + "dev": true, + "requires": { + "lru-cache": "^6.0.0" + } + } + } + }, + "@opentelemetry/propagator-b3": { + "version": "0.20.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/propagator-b3/-/propagator-b3-0.20.0.tgz", + "integrity": "sha512-TGI2D45oUVlbXVEWDedqxwO0WUtzchN/tuYghEHjRTNcVLLKT2ci9JwzHormC+ls98SYPDfvuzpB0+ParoexPQ==", + "dev": true, + "requires": { + "@opentelemetry/core": "0.20.0" + } + }, + "@opentelemetry/propagator-jaeger": { + "version": "0.20.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/propagator-jaeger/-/propagator-jaeger-0.20.0.tgz", + "integrity": "sha512-6kdnd1ePADx4XDaFw4Ea47fdIZohhJbd30Fc4yvl0DO+RR5WEAiAho0IsoTj6L/qvOb/+LAfvjvdk2UOXgb/3Q==", + "dev": true, + "requires": { + "@opentelemetry/core": "0.20.0" + } + }, + "@opentelemetry/resources": { + "version": "0.20.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-0.20.0.tgz", + "integrity": "sha512-nnd3vIM+A9ih6kOVBc2CF5NkTYmdNci5aQ+A5lQjf3HqjEptcGubpg1J1Q84LSFVoITvuH2O6+GhPBrdxYBt8g==", + "requires": { + "@opentelemetry/core": "0.20.0", + "@opentelemetry/semantic-conventions": "0.20.0" + } + }, + "@opentelemetry/semantic-conventions": { + "version": "0.20.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/semantic-conventions/-/semantic-conventions-0.20.0.tgz", + "integrity": "sha512-x40C3vQMttFlnNEfhFwO49jHrY6AoWnntL35TCem3LINr/aw1W0hGhdKY/zweC64CBJEyiHumaae480rqF8eOA==" + }, + "@opentelemetry/tracing": { + "version": "0.20.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/tracing/-/tracing-0.20.0.tgz", + "integrity": "sha512-8ZIH0IBxIucgza0BFNiCCLByUsvu45Dm5k292RlO/E8Z1q/J7otJmh9r/EkaFb0ZSyjNdawmJ1CXnlU7+IQN1w==", + "requires": { + "@opentelemetry/core": "0.20.0", + "@opentelemetry/resources": "0.20.0", + "@opentelemetry/semantic-conventions": "0.20.0", + "lodash.merge": "^4.6.2" + } + }, "@protobufjs/aspromise": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz", @@ -25649,6 +25912,8 @@ "apollo-federation-integration-testsuite": { "version": "file:federation-integration-testsuite-js", "requires": { + "@opentelemetry/api": "^0.20.0", + "@opentelemetry/tracing": "^0.20.0", "apollo-graphql": "^0.9.3", "graphql-tag": "^2.10.4" } @@ -31688,6 +31953,11 @@ "integrity": "sha1-1SfftUVuynzJu5XV2ur4i6VKVFE=", "dev": true }, + "lodash.merge": { + "version": "4.6.2", + "resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz", + "integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==" + }, "lodash.once": { "version": "4.1.1", "resolved": "https://registry.npmjs.org/lodash.once/-/lodash.once-4.1.1.tgz", diff --git a/package.json b/package.json index 4fbaeaad1..64590b924 100644 --- a/package.json +++ b/package.json @@ -68,6 +68,7 @@ "mocked-env": "1.3.4", "nock": "13.1.0", "node-fetch": "2.6.1", + "@opentelemetry/node": "^0.20.0", "prettier": "2.3.0", "rollup": "2.51.1", "rollup-plugin-node-polyfills": "0.2.1",