From 4e819e093976906f4ce7581f93d646f107e36ee9 Mon Sep 17 00:00:00 2001 From: Lenny Burdette Date: Mon, 5 Aug 2019 08:10:11 -0700 Subject: [PATCH 1/3] feat: add support for federated tracing Federated GraphQL services should include timing and error information as a Base64-encoded protocol buffer message in the `"extensions.ftv1"` field. The gateway requests traces by adding a special header to the GraphQL request, and combines traces from all federated services into a single trace. This change includes a Tracer that uses the graphql-ruby [tracing API][t] to record field timings and info and store it on the execution context. It also includes methods on the `ApolloFederation::Tracing` module to pluck the info from the context, convert it to an encoded string, and attach it to the query result's extensions. I used the Apollo Server typescript code as reference: * https://github.com/apollographql/apollo-server/blob/master/packages/apollo-engine-reporting/src/federatedExtension.ts * https://github.com/apollographql/apollo-server/blob/master/packages/apollo-engine-reporting/src/treeBuilder.ts As well as an unfinished fork of apollo-tracing-ruby: * https://github.com/salsify/apollo-tracing-ruby/blob/feature/new-apollo-api/lib/apollo_tracing/tracer.rb * https://github.com/salsify/apollo-tracing-ruby/blob/feature/new-apollo-api/lib/apollo_tracing/trace_tree.rb Federated tracing documentation: https://www.apollographql.com/docs/apollo-server/federation/metrics/ Addresses #14 [t]:https://graphql-ruby.org/queries/tracing.html --- .rubocop.yml | 2 + Gemfile.lock | 1 + README.md | 24 + apollo-federation.gemspec | 2 + bin/generate-protos.sh | 15 + lib/apollo-federation.rb | 4 + lib/apollo-federation/tracing.rb | 50 ++ lib/apollo-federation/tracing/node_map.rb | 72 +++ lib/apollo-federation/tracing/proto.rb | 12 + .../tracing/proto/apollo.proto | 488 ++++++++++++++++++ .../tracing/proto/apollo_pb.rb | 305 +++++++++++ lib/apollo-federation/tracing/tracer.rb | 151 ++++++ .../tracing/node_map_spec.rb | 71 +++ spec/apollo-federation/tracing_spec.rb | 258 +++++++++ 14 files changed, 1455 insertions(+) create mode 100755 bin/generate-protos.sh create mode 100644 lib/apollo-federation/tracing.rb create mode 100644 lib/apollo-federation/tracing/node_map.rb create mode 100644 lib/apollo-federation/tracing/proto.rb create mode 100644 lib/apollo-federation/tracing/proto/apollo.proto create mode 100644 lib/apollo-federation/tracing/proto/apollo_pb.rb create mode 100644 lib/apollo-federation/tracing/tracer.rb create mode 100644 spec/apollo-federation/tracing/node_map_spec.rb create mode 100644 spec/apollo-federation/tracing_spec.rb diff --git a/.rubocop.yml b/.rubocop.yml index 8f5d61ee2..f4e97d994 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -45,6 +45,8 @@ Metrics/ParameterLists: CountKeywordArgs: false Metrics/LineLength: Max: 100 + Exclude: + - lib/apollo-federation/tracing/proto/apollo_pb.rb Naming/FileName: Exclude: diff --git a/Gemfile.lock b/Gemfile.lock index cf916902e..436277886 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -34,6 +34,7 @@ GEM diff-lcs (1.3) erubi (1.8.0) graphql (1.9.11) + google-protobuf (3.9.0) i18n (1.6.0) concurrent-ruby (~> 1.0) jaro_winkler (1.5.3) diff --git a/README.md b/README.md index 7f6f88148..43bc1a383 100644 --- a/README.md +++ b/README.md @@ -147,6 +147,30 @@ class User < BaseObject end ``` +### Tracing + +To support [federated tracing](https://www.apollographql.com/docs/apollo-server/federation/metrics/): + +1. Add `use ApolloFederation::Tracing` to your schema class. +2. Change your controller to add `tracing_enabled: true` to the execution context based on the presence of the "include trace" header: + ```ruby + def execute + # ... + context = { + tracing_enabled: ApolloFederation::Tracing.should_add_traces(headers) + } + # ... + end + ``` +3. Change your controller to attach the traces to the response: + ```ruby + def execute + # ... + result = YourSchema.execute(query, ...) + render json: ApolloFederation::Tracing.attach_trace_to_result(result) + end + ``` + ## Known Issues and Limitations - Currently only works with class-based schemas - Does not add directives to the output of `Schema.to_definition`. Since `graphql-ruby` doesn't natively support schema directives, the directives will only be visible to the [Apollo Gateway](https://www.apollographql.com/docs/apollo-server/api/apollo-gateway/) through the `Query._service` field (see the [Apollo Federation specification](https://www.apollographql.com/docs/apollo-server/federation/federation-spec/)) diff --git a/apollo-federation.gemspec b/apollo-federation.gemspec index b6d14d188..eeff52c32 100644 --- a/apollo-federation.gemspec +++ b/apollo-federation.gemspec @@ -28,6 +28,8 @@ Gem::Specification.new do |spec| spec.add_dependency 'graphql' + spec.add_runtime_dependency 'google-protobuf' + spec.add_development_dependency 'actionpack' spec.add_development_dependency 'pry-byebug' spec.add_development_dependency 'rack' diff --git a/bin/generate-protos.sh b/bin/generate-protos.sh new file mode 100755 index 000000000..3497c1b81 --- /dev/null +++ b/bin/generate-protos.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +set -eo pipefail + +DIR=`dirname "$0"` +OUTPUT_DIR=$DIR/../lib/apollo-federation/tracing/proto + +echo "Removing old client" +rm -f $OUTPUT_DIR/apollo.proto $OUTPUT_DIR/apollo_pb.rb + +echo "Downloading latest Apollo Protobuf IDL" +curl --silent --output lib/apollo-federation/tracing/proto/apollo.proto https://raw.githubusercontent.com/apollographql/apollo-server/master/packages/apollo-engine-reporting-protobuf/src/reports.proto + +echo "Generating Ruby client stubs" +protoc -I lib/apollo-federation/tracing/proto --ruby_out lib/apollo-federation/tracing/proto lib/apollo-federation/tracing/proto/apollo.proto diff --git a/lib/apollo-federation.rb b/lib/apollo-federation.rb index 5699857aa..5083a6c98 100644 --- a/lib/apollo-federation.rb +++ b/lib/apollo-federation.rb @@ -4,3 +4,7 @@ require 'apollo-federation/schema' require 'apollo-federation/object' require 'apollo-federation/field' +require 'apollo-federation/tracing/proto' +require 'apollo-federation/tracing/node_map' +require 'apollo-federation/tracing/tracer' +require 'apollo-federation/tracing' diff --git a/lib/apollo-federation/tracing.rb b/lib/apollo-federation/tracing.rb new file mode 100644 index 000000000..2c5c730f4 --- /dev/null +++ b/lib/apollo-federation/tracing.rb @@ -0,0 +1,50 @@ +# frozen_string_literal: true + +module ApolloFederation + module Tracing + KEY = :ftv1 + + def self.use(schema) + schema.tracer ApolloFederation::Tracing::Tracer + end + + def self.should_add_traces(headers) + headers && headers['apollo-federation-include-trace'] == KEY.to_s + end + + def self.attach_trace_to_result(result) + return result unless result.context[:tracing_enabled] + + trace = result.context.namespace(KEY) + unless trace[:start_time] + raise StandardError.new, 'Apollo Federation Tracing not installed. \ + Add `use ApollFederation::Tracing` to your schema.' + end + + result['errors']&.each do |error| + trace[:node_map].add_error(error) + end + + proto = ApolloFederation::Tracing::Trace.new( + start_time: to_proto_timestamp(trace[:start_time]), + end_time: to_proto_timestamp(trace[:end_time]), + duration_ns: trace[:end_time_nanos] - trace[:start_time_nanos], + root: trace[:node_map].root, + ) + + json = result.to_h + result[:extensions] ||= {} + result[:extensions][KEY] = Base64.encode64(proto.class.encode(proto)) + + if result.context[:debug_tracing] + result[:extensions]["#{KEY}_debug".to_sym] = proto.to_h + end + + json + end + + def self.to_proto_timestamp(time) + Google::Protobuf::Timestamp.new(seconds: time.to_i, nanos: time.nsec) + end + end +end diff --git a/lib/apollo-federation/tracing/node_map.rb b/lib/apollo-federation/tracing/node_map.rb new file mode 100644 index 000000000..97b74abbe --- /dev/null +++ b/lib/apollo-federation/tracing/node_map.rb @@ -0,0 +1,72 @@ +# frozen_string_literal: true + +require 'active_support/core_ext/array/wrap' +require 'apollo-federation/tracing/proto' + +module ApolloFederation + module Tracing + # NodeMap stores a flat map of trace nodes by stringified paths + # (i.e. "_entities.0.id") for fast lookup when we need to alter + # nodes (to add end times or errors.) + # + # When adding a node to the NodeMap, it will create any missing + # parent nodes and ensure the tree is consistent. + # + # Only the "root" node is attached to the trace extension. + class NodeMap + ROOT_KEY = '' + + attr_reader :nodes + def initialize + @nodes = { + ROOT_KEY => ApolloFederation::Tracing::Node.new, + } + end + + def root + nodes[ROOT_KEY] + end + + def node_for_path(path) + nodes[Array.wrap(path).join('.')] + end + + def add(path) + node = ApolloFederation::Tracing::Node.new + node_key = path.join('.') + key = path.last + + case key + when String # field + node.response_name = key + when Integer # index of an array + node.index = key + end + + nodes[node_key] = node + + # find or create a parent node and add this node to its children + parent_path = path[0..-2] + parent_node = nodes[parent_path.join('.')] || add(parent_path) + parent_node.child << node + + node + end + + def add_error(error) + path = Array.wrap(error['path']).join('.') + node = nodes[path] || root + + locations = Array.wrap(error['locations']).map do |location| + ApolloFederation::Tracing::Location.new(location) + end + + node.error << ApolloFederation::Tracing::Error.new( + message: error['message'], + location: locations, + json: JSON.dump(error), + ) + end + end + end +end diff --git a/lib/apollo-federation/tracing/proto.rb b/lib/apollo-federation/tracing/proto.rb new file mode 100644 index 000000000..cce8dff4d --- /dev/null +++ b/lib/apollo-federation/tracing/proto.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true + +require_relative 'proto/apollo_pb' + +module ApolloFederation + module Tracing + Trace = ::Mdg::Engine::Proto::Trace + Node = ::Mdg::Engine::Proto::Trace::Node + Location = ::Mdg::Engine::Proto::Trace::Location + Error = ::Mdg::Engine::Proto::Trace::Error + end +end diff --git a/lib/apollo-federation/tracing/proto/apollo.proto b/lib/apollo-federation/tracing/proto/apollo.proto new file mode 100644 index 000000000..55cb2b39c --- /dev/null +++ b/lib/apollo-federation/tracing/proto/apollo.proto @@ -0,0 +1,488 @@ +syntax = "proto3"; + +package mdg.engine.proto; + +import "google/protobuf/timestamp.proto"; + +message Trace { + message CachePolicy { + enum Scope { + UNKNOWN = 0; + PUBLIC = 1; + PRIVATE = 2; + } + + Scope scope = 1; + int64 max_age_ns = 2; // use 0 for absent, -1 for 0 + } + + message Details { + // The variables associated with this query (unless the reporting agent is + // configured to keep them all private). Values are JSON: ie, strings are + // enclosed in double quotes, etc. The value of a private variable is + // the empty string. + map variables_json = 4; + // Deprecated. Engineproxy did not encode variable values as JSON, so you + // couldn't tell numbers from numeric strings. Send variables_json instead. + map variables = 1; + // Optional: this is the original full query before the signature algorithm + // is applied. Engineproxy always sent this in all traces, which meant that + // literal-masking done by the signature algorithm didn't fully hide + // sensitive data from Engine servers. apollo-engine-reporting does not + // include this by default. (The Engine frontend does not currently show + // this field.) + string raw_query = 2; + // Don't include this in traces inside a FullTracesReport; the operation + // name for these traces comes from the key of the traces_per_query map. + string operation_name = 3; + } + + message Error { + string message = 1; // required + repeated Location location = 2; + uint64 time_ns = 3; + string json = 4; + } + + message HTTP { + message Values { + repeated string value = 1; + } + + enum Method { + UNKNOWN = 0; + OPTIONS = 1; + GET = 2; + HEAD = 3; + POST = 4; + PUT = 5; + DELETE = 6; + TRACE = 7; + CONNECT = 8; + PATCH = 9; + } + Method method = 1; + string host = 2; + string path = 3; + + // Should exclude manual blacklist ("Auth" by default) + map request_headers = 4; + map response_headers = 5; + + uint32 status_code = 6; + + bool secure = 8; // TLS was used + string protocol = 9; // by convention "HTTP/1.0", "HTTP/1.1", "HTTP/2" or "h2" + } + + message Location { + uint32 line = 1; + uint32 column = 2; + } + + // We store information on each resolver execution as a Node on a tree. + // The structure of the tree corresponds to the structure of the GraphQL + // response; it does not indicate the order in which resolvers were + // invoked. Note that nodes representing indexes (and the root node) + // don't contain all Node fields (eg types and times). + message Node { + // The name of the field (for Nodes representing a resolver call) or the + // index in a list (for intermediate Nodes representing elements of a list). + // field_name is the name of the field as it appears in the GraphQL + // response: ie, it may be an alias. (In that case, the original_field_name + // field holds the actual field name from the schema.) In any context where + // we're building up a path, we use the response_name rather than the + // original_field_name. + oneof id { + string response_name = 1; + uint32 index = 2; + } + + string original_field_name = 14; + + // The field's return type; e.g. "String!" for User.email:String! + string type = 3; + + // The field's parent type; e.g. "User" for User.email:String! + string parent_type = 13; + + CachePolicy cache_policy = 5; + + // relative to the trace's start_time, in ns + uint64 start_time = 8; + // relative to the trace's start_time, in ns + uint64 end_time = 9; + + repeated Error error = 11; + repeated Node child = 12; + + reserved 4; + } + + // represents a node in the query plan, under which there is a trace tree for that service fetch. + // In particular, each fetch node represents a call to an implementing service, and calls to implementing + // services may not be unique. See https://github.com/apollographql/apollo-server/blob/master/packages/apollo-gateway/src/QueryPlan.ts + // for more information and details. + message QueryPlanNode { + // This represents a set of nodes to be executed sequentially by the Gateway executor + message SequenceNode { + repeated QueryPlanNode nodes = 1; + } + // This represents a set of nodes to be executed in parallel by the Gateway executor + message ParallelNode { + repeated QueryPlanNode nodes = 1; + } + // This represents a node to send an operation to an implementing service + message FetchNode { + // XXX When we want to include more details about the sub-operation that was + // executed against this service, we should include that here in each fetch node. + // This might include an operation signature, requires directive, reference resolutions, etc. + string serviceName = 1; + + bool traceParsingFailed = 2; + + // This Trace only contains start_time, end_time, duration_ns, and root; + // all timings were calculated **on the federated service**, and clock skew + // will be handled by the ingress server. + Trace trace = 3; + + // relative to the outer trace's start_time, in ns, measured in the gateway. + uint64 sent_time_offset = 4; + + // Wallclock times measured in the gateway for when this operation was + // sent and received. + google.protobuf.Timestamp sent_time = 5; + google.protobuf.Timestamp received_time = 6; + } + + // This node represents a way to reach into the response path and attach related entities. + // XXX Flatten is really not the right name and this node may be renamed in the query planner. + message FlattenNode { + repeated ResponsePathElement response_path = 1; + QueryPlanNode node = 2; + } + message ResponsePathElement { + oneof id { + string field_name = 1; + uint32 index = 2; + } + } + oneof node { + SequenceNode sequence = 1; + ParallelNode parallel = 2; + FetchNode fetch = 3; + FlattenNode flatten = 4; + } + } + + // Wallclock time when the trace began. + google.protobuf.Timestamp start_time = 4; // required + // Wallclock time when the trace ended. + google.protobuf.Timestamp end_time = 3; // required + // High precision duration of the trace; may not equal end_time-start_time + // (eg, if your machine's clock changed during the trace). + uint64 duration_ns = 11; // required + // A tree containing information about all resolvers run directly by this + // service, including errors. + Node root = 14; + + // ------------------------------------------------------------------------- + // Fields below this line are *not* included in federated traces (the traces + // sent from federated services to the gateway). + + // In addition to details.raw_query, we include a "signature" of the query, + // which can be normalized: for example, you may want to discard aliases, drop + // unused operations and fragments, sort fields, etc. The most important thing + // here is that the signature match the signature in StatsReports. In + // StatsReports signatures show up as the key in the per_query map (with the + // operation name prepended). The signature should be a valid GraphQL query. + // All traces must have a signature; if this Trace is in a FullTracesReport + // that signature is in the key of traces_per_query rather than in this field. + // Engineproxy provides the signature in legacy_signature_needs_resigning + // instead. + string signature = 19; + + Details details = 6; + + // Note: engineproxy always sets client_name, client_version, and client_address to "none". + // apollo-engine-reporting allows for them to be set by the user. + string client_name = 7; + string client_version = 8; + string client_address = 9; + string client_reference_id = 23; + + HTTP http = 10; + + CachePolicy cache_policy = 18; + + // If this Trace was created by a gateway, this is the query plan, including + // sub-Traces for federated services. Note that the 'root' tree on the + // top-level Trace won't contain any resolvers (though it could contain errors + // that occurred in the gateway itself). + QueryPlanNode query_plan = 26; + + // Was this response served from a full query response cache? (In that case + // the node tree will have no resolvers.) + bool full_query_cache_hit = 20; + + // Was this query specified successfully as a persisted query hash? + bool persisted_query_hit = 21; + // Did this query contain both a full query string and a persisted query hash? + // (This typically means that a previous request was rejected as an unknown + // persisted query.) + bool persisted_query_register = 22; + + // Was this operation registered and a part of the safelist? + bool registered_operation = 24; + + // Was this operation forbidden due to lack of safelisting? + bool forbidden_operation = 25; + + // -------------------------------------------------------------- + // Fields below this line are only set by the old Go engineproxy. + google.protobuf.Timestamp origin_reported_start_time = 15; + google.protobuf.Timestamp origin_reported_end_time = 16; + uint64 origin_reported_duration_ns = 17; + + // Older agents (eg the Go engineproxy) relied to some degree on the Engine + // backend to run their own semi-compatible implementation of a specific + // variant of query signatures. The backend does not do this for new agents (which + // set the above 'signature' field). It used to still "re-sign" signatures + // from engineproxy, but we've now simplified the backend to no longer do this. + // Deprecated and ignored in FullTracesReports. + string legacy_signature_needs_resigning = 5; + + + // removed: Node parse = 12; Node validate = 13; + // Id128 server_id = 1; Id128 client_id = 2; + reserved 12, 13, 1, 2; +} + +// The `service` value embedded within the header key is not guaranteed to contain an actual service, +// and, in most cases, the service information is trusted to come from upstream processing. If the +// service _is_ specified in this header, then it is checked to match the context that is reporting it. +// Otherwise, the service information is deduced from the token context of the reporter and then sent +// along via other mechanisms (in Kafka, the `ReportKafkaKey). The other information (hostname, +// agent_version, etc.) is sent by the Apollo Engine Reporting agent, but we do not currently save that +// information to any of our persistent storage. +message ReportHeader { + string service = 3; + // eg "host-01.example.com" + string hostname = 5; + + // eg "engineproxy 0.1.0" + string agent_version = 6; // required + // eg "prod-4279-20160804T065423Z-5-g3cf0aa8" (taken from `git describe --tags`) + string service_version = 7; + // eg "node v4.6.0" + string runtime_version = 8; + // eg "Linux box 4.6.5-1-ec2 #1 SMP Mon Aug 1 02:31:38 PDT 2016 x86_64 GNU/Linux" + string uname = 9; + // eg "current", "prod" + string schema_tag = 10; + // The hex representation of the sha512 of the introspection response + string schema_hash = 11; +} + +message PathErrorStats { + map children = 1; + uint64 errors_count = 4; + uint64 requests_with_errors_count = 5; +} + +message ClientNameStats { + // Duration histogram for non-cache-hit queries. + // (See docs/histograms.md for the histogram format.) + repeated int64 latency_count = 1; + reserved 2; // removed: repeated uint64 error_count = 2; + // These per-version fields were used to understand what versions contributed to this sample + // when we were implementing the aggregation of this information ourselves using BigTable. + // However, since the per-version stats don't separate out latency, it makes more sense to + // have stats reported with contextual information so we can have the specific breakdown we're + // looking for. These fields are somewhat misleading as we never actually do any per-version + // awareness with anything reporting in the legacy "per_client_name" stats, and instead use + // "query_stats_with_context" to have more contextual information. + map requests_count_per_version = 3; // required + map cache_hits_per_version = 4; + map persisted_query_hits_per_version = 10; + map persisted_query_misses_per_version = 11; + map registered_operation_count_per_version = 12; + map forbidden_operation_count_per_version = 13; + repeated int64 cache_latency_count = 5; // Duration histogram; see docs/histograms.md + PathErrorStats root_error_stats = 6; + uint64 requests_with_errors_count = 7; + // TTL histograms for cache misses for the public cache. + repeated int64 public_cache_ttl_count = 8; + // TTL histograms for cache misses for the private cache. + repeated int64 private_cache_ttl_count = 9; +} + +message QueryLatencyStats { + repeated int64 latency_count = 1; + uint64 request_count = 2; + uint64 cache_hits = 3; + uint64 persisted_query_hits = 4; + uint64 persisted_query_misses = 5; + repeated int64 cache_latency_count = 6; + PathErrorStats root_error_stats = 7; + uint64 requests_with_errors_count = 8; + repeated int64 public_cache_ttl_count = 9; + repeated int64 private_cache_ttl_count = 10; + uint64 registered_operation_count = 11; + uint64 forbidden_operation_count = 12; +} + +message StatsContext { + string client_reference_id = 1; + string client_name = 2; + string client_version = 3; +} + +message ContextualizedQueryLatencyStats { + QueryLatencyStats query_latency_stats = 1; + StatsContext context = 2; +} + +message ContextualizedTypeStats { + StatsContext context = 1; + map per_type_stat = 2; +} + +message FieldStat { + string name = 2; // deprecated; only set when stored in TypeStat.field + string return_type = 3; // required; eg "String!" for User.email:String! + uint64 errors_count = 4; + uint64 count = 5; + uint64 requests_with_errors_count = 6; + repeated int64 latency_count = 8; // Duration histogram; see docs/histograms.md +} + +message TypeStat { + string name = 1; // deprecated; only set when stored in QueryStats.per_type + repeated FieldStat field = 2; // deprecated; use per_field_stat instead + // Key is (eg) "email" for User.email:String! + map per_field_stat = 3; +} + +message QueryStats { + // Either per_client_name (for back-compat) or query_stats_with_context must be specified. If both are + // specified, then query_stats_with_context will be used and per_client_name will be ignored. Although + // the fields in ClientNameStats mention things "per-version," the information in the "per-version" + // fields will only ever be over the default version, the empty String: "", if arrived at via the + // FullTracesAggregator. + map per_client_name = 1; // deprecated; use stats_with_context instead + repeated ContextualizedQueryLatencyStats query_stats_with_context = 4; + repeated TypeStat per_type = 2; // deprecated; use type_stats_with_context instead + // Key is the parent type, e.g. "User" for User.email:String! + map per_type_stat = 3; // deprecated; use type_stats_with_context instead + repeated ContextualizedTypeStats type_stats_with_context = 5; +} + +// Top-level message type for the server-side traces endpoint +message TracesReport { + ReportHeader header = 1; // required + repeated Trace trace = 2; // required +} + +message Field { + string name = 2; // required; eg "email" for User.email:String! + string return_type = 3; // required; eg "String!" for User.email:String! +} + +message Type { + string name = 1; // required; eg "User" for User.email:String! + repeated Field field = 2; +} + +message MemStats { + uint64 total_bytes = 1; // MemStats.Sys + uint64 stack_bytes = 2; // MemStats.StackSys + uint64 heap_bytes = 3; // MemStats.HeapSys + uint64 heap_released_bytes = 13; // MemStats.HeapReleased + uint64 gc_overhead_bytes = 4; // MemStats.GCSys + + uint64 stack_used_bytes = 5; // MemStats.StackInuse + uint64 heap_allocated_bytes = 6; // MemStats.HeapAlloc + uint64 heap_allocated_objects = 7; // MemStats.HeapObjects + + uint64 heap_allocated_bytes_delta = 8; // MemStats.TotalAlloc delta + uint64 heap_allocated_objects_delta = 9; // MemStats.Mallocs delta + uint64 heap_freed_objects_delta = 10; // MemStats.Frees delta + + uint64 gc_stw_ns_delta = 11; // MemStats.PauseTotalNs delta + uint64 gc_count_delta = 12; // MemStats.NumGC delta +} + +message TimeStats { + uint64 uptime_ns = 1; + uint64 real_ns_delta = 2; + uint64 user_ns_delta = 3; + uint64 sys_ns_delta = 4; +} + +// Top-level message type for the server-side stats endpoint +message StatsReport { + ReportHeader header = 1; // required + + // These fields are about properties of the engineproxy and are not generated + // from FullTracesReports. + MemStats mem_stats = 2; + TimeStats time_stats = 3; + + // Beginning of the period over which stats are collected. + google.protobuf.Timestamp start_time = 8; + // End of the period of which stats are collected. + google.protobuf.Timestamp end_time = 9; + // Only used to interpret mem_stats and time_stats; not generated from + // FullTracesReports. + uint64 realtime_duration = 10; + + + // Maps from query descriptor to QueryStats. Required unless + // legacy_per_query_missing_operation_name is set. The keys are strings of the + // form `# operationName\nsignature` (literal hash and space), with + // operationName - if there is no operation name. + map per_query = 14; + + // Older agents (Go engineproxy) didn't explicitly include the operation name + // in the key of this map, and the server had to parse it out (after a + // re-signing operation which is no longer performed). The key here is just the query + // signature. Deprecated. + map legacy_per_query_implicit_operation_name = 12; + + // Deprecated: it was useful in Optics where we had access to the whole schema + // but has not been ever used in Engine. apollo-engine-reporting will not + // send it. + repeated Type type = 13; +} + +// This is the top-level message used by the new traces ingress. This +// is designed for the apollo-engine-reporting TypeScript agent and will +// eventually be documented as a public ingress API. This message consists +// solely of traces; the equivalent of the StatsReport is automatically +// generated server-side from this message. Agents should send traces +// for all requests in this report. Generally, buffering up until a large +// size has been reached (say, 4MB) or 5-10 seconds has passed is appropriate. +message FullTracesReport { + ReportHeader header = 1; + + // key is statsReportKey (# operationName\nsignature) Note that the nested + // traces will *not* have a signature or details.operationName (because the + // key is adequate). + // + // We also assume that traces don't have + // legacy_per_query_implicit_operation_name, and we don't require them to have + // details.raw_query (which would consume a lot of space and has privacy/data + // access issues, and isn't currently exposed by our app anyway). + map traces_per_query = 5; +} + +// Just a sequence of traces with the same statsReportKey. +message Traces { + repeated Trace trace = 1; +} + +message TraceV1 { + ReportHeader header = 1; + Trace trace = 2; +} diff --git a/lib/apollo-federation/tracing/proto/apollo_pb.rb b/lib/apollo-federation/tracing/proto/apollo_pb.rb new file mode 100644 index 000000000..196eefa42 --- /dev/null +++ b/lib/apollo-federation/tracing/proto/apollo_pb.rb @@ -0,0 +1,305 @@ +# frozen_string_literal: true + +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: apollo.proto + +require 'google/protobuf' + +require 'google/protobuf/timestamp_pb' +Google::Protobuf::DescriptorPool.generated_pool.build do + add_file('apollo.proto', syntax: :proto3) do + add_message 'mdg.engine.proto.Trace' do + optional :start_time, :message, 4, 'google.protobuf.Timestamp' + optional :end_time, :message, 3, 'google.protobuf.Timestamp' + optional :duration_ns, :uint64, 11 + optional :root, :message, 14, 'mdg.engine.proto.Trace.Node' + optional :signature, :string, 19 + optional :details, :message, 6, 'mdg.engine.proto.Trace.Details' + optional :client_name, :string, 7 + optional :client_version, :string, 8 + optional :client_address, :string, 9 + optional :client_reference_id, :string, 23 + optional :http, :message, 10, 'mdg.engine.proto.Trace.HTTP' + optional :cache_policy, :message, 18, 'mdg.engine.proto.Trace.CachePolicy' + optional :query_plan, :message, 26, 'mdg.engine.proto.Trace.QueryPlanNode' + optional :full_query_cache_hit, :bool, 20 + optional :persisted_query_hit, :bool, 21 + optional :persisted_query_register, :bool, 22 + optional :registered_operation, :bool, 24 + optional :forbidden_operation, :bool, 25 + optional :origin_reported_start_time, :message, 15, 'google.protobuf.Timestamp' + optional :origin_reported_end_time, :message, 16, 'google.protobuf.Timestamp' + optional :origin_reported_duration_ns, :uint64, 17 + optional :legacy_signature_needs_resigning, :string, 5 + end + add_message 'mdg.engine.proto.Trace.CachePolicy' do + optional :scope, :enum, 1, 'mdg.engine.proto.Trace.CachePolicy.Scope' + optional :max_age_ns, :int64, 2 + end + add_enum 'mdg.engine.proto.Trace.CachePolicy.Scope' do + value :UNKNOWN, 0 + value :PUBLIC, 1 + value :PRIVATE, 2 + end + add_message 'mdg.engine.proto.Trace.Details' do + map :variables_json, :string, :string, 4 + map :variables, :string, :bytes, 1 + optional :raw_query, :string, 2 + optional :operation_name, :string, 3 + end + add_message 'mdg.engine.proto.Trace.Error' do + optional :message, :string, 1 + repeated :location, :message, 2, 'mdg.engine.proto.Trace.Location' + optional :time_ns, :uint64, 3 + optional :json, :string, 4 + end + add_message 'mdg.engine.proto.Trace.HTTP' do + optional :method, :enum, 1, 'mdg.engine.proto.Trace.HTTP.Method' + optional :host, :string, 2 + optional :path, :string, 3 + map :request_headers, :string, :message, 4, 'mdg.engine.proto.Trace.HTTP.Values' + map :response_headers, :string, :message, 5, 'mdg.engine.proto.Trace.HTTP.Values' + optional :status_code, :uint32, 6 + optional :secure, :bool, 8 + optional :protocol, :string, 9 + end + add_message 'mdg.engine.proto.Trace.HTTP.Values' do + repeated :value, :string, 1 + end + add_enum 'mdg.engine.proto.Trace.HTTP.Method' do + value :UNKNOWN, 0 + value :OPTIONS, 1 + value :GET, 2 + value :HEAD, 3 + value :POST, 4 + value :PUT, 5 + value :DELETE, 6 + value :TRACE, 7 + value :CONNECT, 8 + value :PATCH, 9 + end + add_message 'mdg.engine.proto.Trace.Location' do + optional :line, :uint32, 1 + optional :column, :uint32, 2 + end + add_message 'mdg.engine.proto.Trace.Node' do + optional :original_field_name, :string, 14 + optional :type, :string, 3 + optional :parent_type, :string, 13 + optional :cache_policy, :message, 5, 'mdg.engine.proto.Trace.CachePolicy' + optional :start_time, :uint64, 8 + optional :end_time, :uint64, 9 + repeated :error, :message, 11, 'mdg.engine.proto.Trace.Error' + repeated :child, :message, 12, 'mdg.engine.proto.Trace.Node' + oneof :id do + optional :response_name, :string, 1 + optional :index, :uint32, 2 + end + end + add_message 'mdg.engine.proto.Trace.QueryPlanNode' do + oneof :node do + optional :sequence, :message, 1, 'mdg.engine.proto.Trace.QueryPlanNode.SequenceNode' + optional :parallel, :message, 2, 'mdg.engine.proto.Trace.QueryPlanNode.ParallelNode' + optional :fetch, :message, 3, 'mdg.engine.proto.Trace.QueryPlanNode.FetchNode' + optional :flatten, :message, 4, 'mdg.engine.proto.Trace.QueryPlanNode.FlattenNode' + end + end + add_message 'mdg.engine.proto.Trace.QueryPlanNode.SequenceNode' do + repeated :nodes, :message, 1, 'mdg.engine.proto.Trace.QueryPlanNode' + end + add_message 'mdg.engine.proto.Trace.QueryPlanNode.ParallelNode' do + repeated :nodes, :message, 1, 'mdg.engine.proto.Trace.QueryPlanNode' + end + add_message 'mdg.engine.proto.Trace.QueryPlanNode.FetchNode' do + optional :serviceName, :string, 1 + optional :traceParsingFailed, :bool, 2 + optional :trace, :message, 3, 'mdg.engine.proto.Trace' + optional :sent_time_offset, :uint64, 4 + optional :sent_time, :message, 5, 'google.protobuf.Timestamp' + optional :received_time, :message, 6, 'google.protobuf.Timestamp' + end + add_message 'mdg.engine.proto.Trace.QueryPlanNode.FlattenNode' do + repeated :response_path, :message, 1, 'mdg.engine.proto.Trace.QueryPlanNode.ResponsePathElement' + optional :node, :message, 2, 'mdg.engine.proto.Trace.QueryPlanNode' + end + add_message 'mdg.engine.proto.Trace.QueryPlanNode.ResponsePathElement' do + oneof :id do + optional :field_name, :string, 1 + optional :index, :uint32, 2 + end + end + add_message 'mdg.engine.proto.ReportHeader' do + optional :service, :string, 3 + optional :hostname, :string, 5 + optional :agent_version, :string, 6 + optional :service_version, :string, 7 + optional :runtime_version, :string, 8 + optional :uname, :string, 9 + optional :schema_tag, :string, 10 + optional :schema_hash, :string, 11 + end + add_message 'mdg.engine.proto.PathErrorStats' do + map :children, :string, :message, 1, 'mdg.engine.proto.PathErrorStats' + optional :errors_count, :uint64, 4 + optional :requests_with_errors_count, :uint64, 5 + end + add_message 'mdg.engine.proto.ClientNameStats' do + repeated :latency_count, :int64, 1 + map :requests_count_per_version, :string, :uint64, 3 + map :cache_hits_per_version, :string, :uint64, 4 + map :persisted_query_hits_per_version, :string, :uint64, 10 + map :persisted_query_misses_per_version, :string, :uint64, 11 + map :registered_operation_count_per_version, :string, :uint64, 12 + map :forbidden_operation_count_per_version, :string, :uint64, 13 + repeated :cache_latency_count, :int64, 5 + optional :root_error_stats, :message, 6, 'mdg.engine.proto.PathErrorStats' + optional :requests_with_errors_count, :uint64, 7 + repeated :public_cache_ttl_count, :int64, 8 + repeated :private_cache_ttl_count, :int64, 9 + end + add_message 'mdg.engine.proto.QueryLatencyStats' do + repeated :latency_count, :int64, 1 + optional :request_count, :uint64, 2 + optional :cache_hits, :uint64, 3 + optional :persisted_query_hits, :uint64, 4 + optional :persisted_query_misses, :uint64, 5 + repeated :cache_latency_count, :int64, 6 + optional :root_error_stats, :message, 7, 'mdg.engine.proto.PathErrorStats' + optional :requests_with_errors_count, :uint64, 8 + repeated :public_cache_ttl_count, :int64, 9 + repeated :private_cache_ttl_count, :int64, 10 + optional :registered_operation_count, :uint64, 11 + optional :forbidden_operation_count, :uint64, 12 + end + add_message 'mdg.engine.proto.StatsContext' do + optional :client_reference_id, :string, 1 + optional :client_name, :string, 2 + optional :client_version, :string, 3 + end + add_message 'mdg.engine.proto.ContextualizedQueryLatencyStats' do + optional :query_latency_stats, :message, 1, 'mdg.engine.proto.QueryLatencyStats' + optional :context, :message, 2, 'mdg.engine.proto.StatsContext' + end + add_message 'mdg.engine.proto.ContextualizedTypeStats' do + optional :context, :message, 1, 'mdg.engine.proto.StatsContext' + map :per_type_stat, :string, :message, 2, 'mdg.engine.proto.TypeStat' + end + add_message 'mdg.engine.proto.FieldStat' do + optional :name, :string, 2 + optional :return_type, :string, 3 + optional :errors_count, :uint64, 4 + optional :count, :uint64, 5 + optional :requests_with_errors_count, :uint64, 6 + repeated :latency_count, :int64, 8 + end + add_message 'mdg.engine.proto.TypeStat' do + optional :name, :string, 1 + repeated :field, :message, 2, 'mdg.engine.proto.FieldStat' + map :per_field_stat, :string, :message, 3, 'mdg.engine.proto.FieldStat' + end + add_message 'mdg.engine.proto.QueryStats' do + map :per_client_name, :string, :message, 1, 'mdg.engine.proto.ClientNameStats' + repeated :query_stats_with_context, :message, 4, 'mdg.engine.proto.ContextualizedQueryLatencyStats' + repeated :per_type, :message, 2, 'mdg.engine.proto.TypeStat' + map :per_type_stat, :string, :message, 3, 'mdg.engine.proto.TypeStat' + repeated :type_stats_with_context, :message, 5, 'mdg.engine.proto.ContextualizedTypeStats' + end + add_message 'mdg.engine.proto.TracesReport' do + optional :header, :message, 1, 'mdg.engine.proto.ReportHeader' + repeated :trace, :message, 2, 'mdg.engine.proto.Trace' + end + add_message 'mdg.engine.proto.Field' do + optional :name, :string, 2 + optional :return_type, :string, 3 + end + add_message 'mdg.engine.proto.Type' do + optional :name, :string, 1 + repeated :field, :message, 2, 'mdg.engine.proto.Field' + end + add_message 'mdg.engine.proto.MemStats' do + optional :total_bytes, :uint64, 1 + optional :stack_bytes, :uint64, 2 + optional :heap_bytes, :uint64, 3 + optional :heap_released_bytes, :uint64, 13 + optional :gc_overhead_bytes, :uint64, 4 + optional :stack_used_bytes, :uint64, 5 + optional :heap_allocated_bytes, :uint64, 6 + optional :heap_allocated_objects, :uint64, 7 + optional :heap_allocated_bytes_delta, :uint64, 8 + optional :heap_allocated_objects_delta, :uint64, 9 + optional :heap_freed_objects_delta, :uint64, 10 + optional :gc_stw_ns_delta, :uint64, 11 + optional :gc_count_delta, :uint64, 12 + end + add_message 'mdg.engine.proto.TimeStats' do + optional :uptime_ns, :uint64, 1 + optional :real_ns_delta, :uint64, 2 + optional :user_ns_delta, :uint64, 3 + optional :sys_ns_delta, :uint64, 4 + end + add_message 'mdg.engine.proto.StatsReport' do + optional :header, :message, 1, 'mdg.engine.proto.ReportHeader' + optional :mem_stats, :message, 2, 'mdg.engine.proto.MemStats' + optional :time_stats, :message, 3, 'mdg.engine.proto.TimeStats' + optional :start_time, :message, 8, 'google.protobuf.Timestamp' + optional :end_time, :message, 9, 'google.protobuf.Timestamp' + optional :realtime_duration, :uint64, 10 + map :per_query, :string, :message, 14, 'mdg.engine.proto.QueryStats' + map :legacy_per_query_implicit_operation_name, :string, :message, 12, 'mdg.engine.proto.QueryStats' + repeated :type, :message, 13, 'mdg.engine.proto.Type' + end + add_message 'mdg.engine.proto.FullTracesReport' do + optional :header, :message, 1, 'mdg.engine.proto.ReportHeader' + map :traces_per_query, :string, :message, 5, 'mdg.engine.proto.Traces' + end + add_message 'mdg.engine.proto.Traces' do + repeated :trace, :message, 1, 'mdg.engine.proto.Trace' + end + add_message 'mdg.engine.proto.TraceV1' do + optional :header, :message, 1, 'mdg.engine.proto.ReportHeader' + optional :trace, :message, 2, 'mdg.engine.proto.Trace' + end + end +end + +module Mdg + module Engine + module Proto + Trace = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.Trace').msgclass + Trace::CachePolicy = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.Trace.CachePolicy').msgclass + Trace::CachePolicy::Scope = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.Trace.CachePolicy.Scope').enummodule + Trace::Details = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.Trace.Details').msgclass + Trace::Error = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.Trace.Error').msgclass + Trace::HTTP = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.Trace.HTTP').msgclass + Trace::HTTP::Values = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.Trace.HTTP.Values').msgclass + Trace::HTTP::Method = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.Trace.HTTP.Method').enummodule + Trace::Location = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.Trace.Location').msgclass + Trace::Node = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.Trace.Node').msgclass + Trace::QueryPlanNode = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.Trace.QueryPlanNode').msgclass + Trace::QueryPlanNode::SequenceNode = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.Trace.QueryPlanNode.SequenceNode').msgclass + Trace::QueryPlanNode::ParallelNode = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.Trace.QueryPlanNode.ParallelNode').msgclass + Trace::QueryPlanNode::FetchNode = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.Trace.QueryPlanNode.FetchNode').msgclass + Trace::QueryPlanNode::FlattenNode = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.Trace.QueryPlanNode.FlattenNode').msgclass + Trace::QueryPlanNode::ResponsePathElement = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.Trace.QueryPlanNode.ResponsePathElement').msgclass + ReportHeader = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.ReportHeader').msgclass + PathErrorStats = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.PathErrorStats').msgclass + ClientNameStats = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.ClientNameStats').msgclass + QueryLatencyStats = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.QueryLatencyStats').msgclass + StatsContext = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.StatsContext').msgclass + ContextualizedQueryLatencyStats = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.ContextualizedQueryLatencyStats').msgclass + ContextualizedTypeStats = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.ContextualizedTypeStats').msgclass + FieldStat = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.FieldStat').msgclass + TypeStat = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.TypeStat').msgclass + QueryStats = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.QueryStats').msgclass + TracesReport = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.TracesReport').msgclass + Field = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.Field').msgclass + Type = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.Type').msgclass + MemStats = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.MemStats').msgclass + TimeStats = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.TimeStats').msgclass + StatsReport = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.StatsReport').msgclass + FullTracesReport = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.FullTracesReport').msgclass + Traces = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.Traces').msgclass + TraceV1 = Google::Protobuf::DescriptorPool.generated_pool.lookup('mdg.engine.proto.TraceV1').msgclass + end + end +end diff --git a/lib/apollo-federation/tracing/tracer.rb b/lib/apollo-federation/tracing/tracer.rb new file mode 100644 index 000000000..41a81ca9d --- /dev/null +++ b/lib/apollo-federation/tracing/tracer.rb @@ -0,0 +1,151 @@ +# frozen_string_literal: true + +module ApolloFederation + module Tracing + module Tracer + # store string constants to avoid creating new strings for each call to .trace + EXECUTE_QUERY = 'execute_query' + EXECUTE_QUERY_LAZY = 'execute_query_lazy' + EXECUTE_FIELD = 'execute_field' + EXECUTE_FIELD_LAZY = 'execute_field_lazy' + + def self.trace(key, data, &block) + case key + when EXECUTE_QUERY + execute_query(data, &block) + when EXECUTE_QUERY_LAZY + execute_query_lazy(data, &block) + when EXECUTE_FIELD + execute_field(data, &block) + when EXECUTE_FIELD_LAZY + execute_field_lazy(data, &block) + else + yield + end + end + + # Step 1: + # Create a trace hash on the query context and record start times. + def self.execute_query(data, &block) + query = data.fetch(:query) + return block.call unless query.context && query.context[:tracing_enabled] + + query.context.namespace(ApolloFederation::Tracing::KEY).merge!( + start_time: Time.now.utc, + start_time_nanos: Process.clock_gettime(Process::CLOCK_MONOTONIC, :nanosecond), + node_map: NodeMap.new, + ) + + block.call + end + + # Step 4: + # Record end times and merge them into the trace hash. + def self.execute_query_lazy(data, &block) + result = block.call + + query = data.fetch(:query) + return result unless query.context && query.context[:tracing_enabled] + + trace = query.context.namespace(ApolloFederation::Tracing::KEY) + + trace.merge!( + end_time: Time.now.utc, + end_time_nanos: Process.clock_gettime(Process::CLOCK_MONOTONIC, :nanosecond), + ) + + result + end + + # Step 2: + # * Record start and end times for the field resolver. + # * Rescue errors so the method doesn't exit early. + # * Create a trace "node" and attach field details. + # * Propagate the error (if necessary) so it ends up in the top-level errors array. + # + # The values in `data` are different depending on the executor runtime. + # https://graphql-ruby.org/api-doc/1.9.3/GraphQL/Tracing + # + # Nodes are added the NodeMap stored in the trace hash. + # + # Errors are added to nodes in `ApolloFederation::Tracing.attach_trace_to_result` + # because we don't have the error `location` here. + # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity + def self.execute_field(data, &block) + context = data.fetch(:context) || data.fetch(:query).context + return block.call unless context && context[:tracing_enabled] + + start_time_nanos = Process.clock_gettime(Process::CLOCK_MONOTONIC, :nanosecond) + + begin + result = block.call + rescue StandardError => e + error = e + end + + end_time_nanos = Process.clock_gettime(Process::CLOCK_MONOTONIC, :nanosecond) + + # interpreter runtime + if data.include?(:context) + path = context.path + field_name = context.field.graphql_name + field_type = context.field.type.to_s + parent_type = context.parent_type.graphql_name + else # legacy runtime + path = data.fetch(:path) + field_name = data.fetch(:field).graphql_name + field_type = data.fetch(:field).type.unwrap.graphql_name + parent_type = data.fetch(:owner).graphql_name + end + + trace = context.namespace(ApolloFederation::Tracing::KEY) + node = trace[:node_map].add(path) + + # original_field_name is set only for aliased fields + node.original_field_name = field_name if field_name != path.last + node.type = field_type + node.parent_type = parent_type + node.start_time = start_time_nanos - trace[:start_time_nanos] + node.end_time = end_time_nanos - trace[:start_time_nanos] + + raise error if error + + result + end + + # Optional Step 3: + # Overwrite the end times on the trace node if the resolver was lazy. + def self.execute_field_lazy(data, &block) + context = data.fetch(:context) || data.fetch(:query).context + return block.call unless context && context[:tracing_enabled] + + begin + result = block.call + rescue StandardError => e + error = e + end + + end_time_nanos = Process.clock_gettime(Process::CLOCK_MONOTONIC, :nanosecond) + + # interpreter runtime + if data.include?(:context) + context = data.fetch(:context) + path = context.path + else # legacy runtime + context = data.fetch(:query).context + path = data.fetch(:path) + end + + trace = context.namespace(ApolloFederation::Tracing::KEY) + + node = trace[:node_map].node_for_path(path) + node.end_time = end_time_nanos - trace[:start_time_nanos] + + raise error if error + + result + end + # rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity + end + end +end diff --git a/spec/apollo-federation/tracing/node_map_spec.rb b/spec/apollo-federation/tracing/node_map_spec.rb new file mode 100644 index 000000000..9def9da4a --- /dev/null +++ b/spec/apollo-federation/tracing/node_map_spec.rb @@ -0,0 +1,71 @@ +# frozen_string_literal: true + +require 'spec_helper' +require 'apollo-federation/tracing/node_map' + +RSpec.describe ApolloFederation::Tracing::NodeMap do + it 'creates parents for any deeply nested paths' do + map = described_class.new + map.add(['_entities', 0, 'reviews', 0, 'author', 'name']) + expect(map.nodes.keys).to eq [ + '', + '_entities.0.reviews.0.author.name', + '_entities.0.reviews.0.author', + '_entities.0.reviews.0', + '_entities.0.reviews', + '_entities.0', + '_entities', + ] + end + + it 'correctly assigns the node id for index paths' do + map = described_class.new + map.add(['_entities', 4, 'reviews']) + expect(map.node_for_path('_entities.4').index).to eq 4 + end + + it 'correctly assigns the node id for named paths' do + map = described_class.new + map.add(['_entities', 4, 'reviews']) + expect(map.node_for_path('_entities.4.reviews').response_name).to eq 'reviews' + end + + it 'adds errors by path' do + map = described_class.new + map.add(['_entities', 4, 'reviews']) + + error = { + 'path' => ['_entities', 4, 'reviews'], + 'message' => 'whoops', + 'locations' => ['line' => 2, 'column' => 4], + } + map.add_error(error) + + expect(map.node_for_path('_entities.4.reviews').error).to eq [ + ApolloFederation::Tracing::Error.new( + json: JSON.dump(error), + location: [ApolloFederation::Tracing::Location.new(line: 2, column: 4)], + message: 'whoops', + ), + ] + end + + it 'adds errors to the root object when no node is found' do + map = described_class.new + + error = { + 'path' => ['_entities', 4, 'reviews'], + 'message' => 'whoops', + 'locations' => ['line' => 2, 'column' => 4], + } + map.add_error(error) + + expect(map.root.error).to eq [ + ApolloFederation::Tracing::Error.new( + json: JSON.dump(error), + location: [ApolloFederation::Tracing::Location.new(line: 2, column: 4)], + message: 'whoops', + ), + ] + end +end diff --git a/spec/apollo-federation/tracing_spec.rb b/spec/apollo-federation/tracing_spec.rb new file mode 100644 index 000000000..8a8e8b271 --- /dev/null +++ b/spec/apollo-federation/tracing_spec.rb @@ -0,0 +1,258 @@ +# frozen_string_literal: true + +require 'spec_helper' +require 'apollo-federation' + +RSpec.describe ApolloFederation::Tracing do + let(:base_schema) do + Class.new(GraphQL::Schema) do + use ApolloFederation::Tracing + end + end + + # configure clocks to increment by 1 for each call + before do + t = Time.new(2019, 8, 4, 12, 0, 0, '+00:00') + allow(Time).to receive(:now) { t += 1 } + + # nanos are used for durations and offsets, so you'll never see 42, 43, ... + # instead, you'll see the difference from the first call (the start time) + # which will be 1, 2, 3 ... + ns = 42 + allow(Process).to receive(:clock_gettime) + .with(Process::CLOCK_MONOTONIC, :nanosecond) { ns += 1 } + end + + describe 'respecting options on context' do + let(:schema) do + query_obj = Class.new(GraphQL::Schema::Object) do + graphql_name 'Query' + + field :test, String, null: false + + def test + 'hello world' + end + end + + Class.new(base_schema) do + query query_obj + end + end + + it 'does not add tracing extension by default' do + result = schema.execute('{ test }') + described_class.attach_trace_to_result(result) + expect(result[:extensions]).to be_nil + end + + it 'adds the extensions.ftv1 when the context has tracing_enabled: true' do + result = schema.execute('{ test }', context: { tracing_enabled: true }) + described_class.attach_trace_to_result(result) + expect(result[:extensions][:ftv1]).not_to be_nil + end + + it 'adds debugging info when the context has debug_tracing: true' do + result = schema.execute('{ test }', context: { tracing_enabled: true, debug_tracing: true }) + described_class.attach_trace_to_result(result) + expect(result[:extensions][:ftv1_debug]).not_to be_nil + end + end + + describe 'building the trace tree' do + let(:schema) do + grandchild_obj = Class.new(GraphQL::Schema::Object) do + graphql_name 'Grandchild' + + field :id, String, null: false + end + + child_obj = Class.new(GraphQL::Schema::Object) do + graphql_name 'Child' + + field :id, String, null: false + field :grandchild, grandchild_obj, null: false + + def grandchild + { id: 'grandchild' } + end + end + + parent_obj = Class.new(GraphQL::Schema::Object) do + graphql_name 'Parent' + + field :id, String, null: false + field :child, child_obj, null: false + + def child + { id: 'child' } + end + end + + query_obj = Class.new(GraphQL::Schema::Object) do + graphql_name 'Query' + + field :parent, parent_obj, null: false + + def parent + { id: 'parent' } + end + end + + Class.new(base_schema) do + query query_obj + end + end + + it 'records timing for children' do + result = schema.execute( + '{ parent { id, child { id, grandchild { id } } } }', + context: { tracing_enabled: true }, + ) + described_class.attach_trace_to_result(result) + + trace = ApolloFederation::Tracing::Trace.decode(Base64.decode64(result[:extensions][:ftv1])) + expect(trace).to eq(ApolloFederation::Tracing::Trace.new( + start_time: { seconds: 1_564_920_001, nanos: 0 }, + end_time: { seconds: 1_564_920_002, nanos: 0 }, + duration_ns: 13, + root: { + child: [{ + response_name: 'parent', + type: 'Parent!', + start_time: 1, + end_time: 2, + parent_type: 'Query', + child: [{ + response_name: 'id', + type: 'String!', + start_time: 3, + end_time: 4, + parent_type: 'Parent', + }, { + response_name: 'child', + type: 'Child!', + start_time: 5, + end_time: 6, + parent_type: 'Parent', + child: [{ + response_name: 'id', + type: 'String!', + start_time: 7, + end_time: 8, + parent_type: 'Child', + }, { + response_name: 'grandchild', + type: 'Grandchild!', + start_time: 9, + end_time: 10, + parent_type: 'Child', + child: [{ + response_name: 'id', + type: 'String!', + start_time: 11, + end_time: 12, + parent_type: 'Grandchild', + }], + },], + },], + }], + }, + )) + end + end + + describe 'indices and errors' do + let(:schema) do + item_obj = Class.new(GraphQL::Schema::Object) do + graphql_name 'Item' + + field :id, String, null: false + field :name, String, null: false + + def name + raise GraphQL::ExecutionError, "Can't continue with this query" if object[:id] == '2' + + "Item #{object[:id]}" + end + end + + query_obj = Class.new(GraphQL::Schema::Object) do + graphql_name 'Query' + + field :items, [item_obj], null: false + + def items + [{ id: '1' }, { id: '2' }] + end + end + + Class.new(base_schema) do + query query_obj + end + end + + let(:trace) do + result = schema.execute('{ items { id, name } }', context: { tracing_enabled: true }) + described_class.attach_trace_to_result(result) + ApolloFederation::Tracing::Trace.decode(Base64.decode64(result[:extensions][:ftv1])) + end + + it 'records index instead of response_name for items in arrays' do + expect(trace).to eq ApolloFederation::Tracing::Trace.new( + start_time: { seconds: 1_564_920_001, nanos: 0 }, + end_time: { seconds: 1_564_920_002, nanos: 0 }, + duration_ns: 11, + root: { + child: [{ + response_name: 'items', + type: '[Item!]!', + start_time: 1, + end_time: 2, + parent_type: 'Query', + child: [ + { + index: 0, + child: [{ + response_name: 'id', + type: 'String!', + start_time: 3, + end_time: 4, + parent_type: 'Item', + }, { + response_name: 'name', + type: 'String!', + start_time: 5, + end_time: 6, + parent_type: 'Item', + },], + }, + { + index: 1, + child: [{ + response_name: 'id', + type: 'String!', + start_time: 7, + end_time: 8, + parent_type: 'Item', + }, { + response_name: 'name', + type: 'String!', + start_time: 9, + end_time: 10, + parent_type: 'Item', + error: [{ + message: "Can't continue with this query", + location: [{ line: 1, column: 15 }], + json: %({"message":"Can't continue with this query",\ +"locations":[{"line":1,"column":15}],"path":["items",1,"name"]}), + }], + },], + }, + ], + }], + }, + ) + end + end +end From 125e1affbdbad7041b2ee770960430cde93a24bd Mon Sep 17 00:00:00 2001 From: Lenny Burdette Date: Tue, 20 Aug 2019 09:26:38 -0700 Subject: [PATCH 2/3] chore: code review * `module_function` vs repetitive `self.` * fix a call to `result.to_h` in `.attach_trace_to_result` --- lib/apollo-federation/tracing.rb | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lib/apollo-federation/tracing.rb b/lib/apollo-federation/tracing.rb index 2c5c730f4..b4b337029 100644 --- a/lib/apollo-federation/tracing.rb +++ b/lib/apollo-federation/tracing.rb @@ -3,16 +3,19 @@ module ApolloFederation module Tracing KEY = :ftv1 + DEBUG_KEY = "#{KEY}_debug".to_sym - def self.use(schema) + module_function + + def use(schema) schema.tracer ApolloFederation::Tracing::Tracer end - def self.should_add_traces(headers) + def should_add_traces(headers) headers && headers['apollo-federation-include-trace'] == KEY.to_s end - def self.attach_trace_to_result(result) + def attach_trace_to_result(result) return result unless result.context[:tracing_enabled] trace = result.context.namespace(KEY) @@ -32,18 +35,17 @@ def self.attach_trace_to_result(result) root: trace[:node_map].root, ) - json = result.to_h result[:extensions] ||= {} result[:extensions][KEY] = Base64.encode64(proto.class.encode(proto)) if result.context[:debug_tracing] - result[:extensions]["#{KEY}_debug".to_sym] = proto.to_h + result[:extensions][DEBUG_KEY] = proto.to_h end - json + result.to_h end - def self.to_proto_timestamp(time) + def to_proto_timestamp(time) Google::Protobuf::Timestamp.new(seconds: time.to_i, nanos: time.nsec) end end From dd70bf48a408f15f44e73f86ebe78d8c919f19e8 Mon Sep 17 00:00:00 2001 From: Lenny Burdette Date: Fri, 6 Sep 2019 22:45:06 -0700 Subject: [PATCH 3/3] FIXUP: add comments explaining the tracer execution order and add a test to cover lazy fields will squash later --- Gemfile.lock | 5 ++- lib/apollo-federation/tracing/tracer.rb | 31 +++++++++++++++ spec/apollo-federation/tracing_spec.rb | 52 +++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 2 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 436277886..1e5477141 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -2,6 +2,7 @@ PATH remote: . specs: apollo-federation (0.3.2) + google-protobuf graphql GEM @@ -33,8 +34,8 @@ GEM crass (1.0.4) diff-lcs (1.3) erubi (1.8.0) - graphql (1.9.11) - google-protobuf (3.9.0) + google-protobuf (3.9.1) + graphql (1.9.12) i18n (1.6.0) concurrent-ruby (~> 1.0) jaro_winkler (1.5.3) diff --git a/lib/apollo-federation/tracing/tracer.rb b/lib/apollo-federation/tracing/tracer.rb index 41a81ca9d..7934e0758 100644 --- a/lib/apollo-federation/tracing/tracer.rb +++ b/lib/apollo-federation/tracing/tracer.rb @@ -1,5 +1,36 @@ # frozen_string_literal: true +# Trace events are nested and fire in this order +# for a simple single-field query like `{ foo }`: +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# # `execute_field_lazy` fires *only* when the field is lazy +# # (https://graphql-ruby.org/schema/lazy_execution.html) +# # so if it fires we should overwrite the ending times recorded +# # in `execute_field` to capture the total execution time. +# +# +# +# +# +# # `execute_query_lazy` *always* fires, so it's a +# # safe place to capture ending times of the full query. +# +# + module ApolloFederation module Tracing module Tracer diff --git a/spec/apollo-federation/tracing_spec.rb b/spec/apollo-federation/tracing_spec.rb index 8a8e8b271..02ff8c732 100644 --- a/spec/apollo-federation/tracing_spec.rb +++ b/spec/apollo-federation/tracing_spec.rb @@ -162,6 +162,58 @@ def parent end end + class Lazy + def lazy_method + 'lazy_value' + end + end + + describe 'lazy values' do + let(:schema) do + query_obj = Class.new(GraphQL::Schema::Object) do + graphql_name 'Query' + + field :test, String, null: false + + def test + Lazy.new + end + end + + Class.new(base_schema) do + query query_obj + lazy_resolve(Lazy, :lazy_method) + end + end + + let(:trace) do + result = schema.execute('{ test }', context: { tracing_enabled: true }) + described_class.attach_trace_to_result(result) + ApolloFederation::Tracing::Trace.decode(Base64.decode64(result[:extensions][:ftv1])) + end + + it 'works with lazy values' do + expect(trace).to eq ApolloFederation::Tracing::Trace.new( + start_time: { seconds: 1_564_920_001, nanos: 0 }, + end_time: { seconds: 1_564_920_002, nanos: 0 }, + duration_ns: 4, + root: { + child: [{ + response_name: 'test', + type: 'String!', + start_time: 1, + # This is the only discrepancy between a normal field and a lazy field. + # The fake clock incremented once at the end of the `execute_field` step, + # and again at the end of the `execute_field_lazy` step, so we record the + # end time as being two nanoseconds after the start time instead of one. + end_time: 3, + parent_type: 'Query', + }], + }, + ) + end + end + describe 'indices and errors' do let(:schema) do item_obj = Class.new(GraphQL::Schema::Object) do