Skip to content

Commit

Permalink
Don't create Elasticsearch span names containing document IDs (#705)
Browse files Browse the repository at this point in the history
* Fix typo: _DEFALT_OP_NAME

* Extract ES document ID from URL, put in attributes

Elasticsearch creates URLs for index() and delete() before they hit
perform_request(). This means there would be many unique span names
containing unique document IDs, of the form
'Elasticsearch/indexname/_doc/documentid'.

This extracts the document ID from the URL and replaces it with ':id',
then puts it in the span's attributes.

* Add TODO comment with link to issue

* Add CHANGELOG entry

* Don't use custom doc types, deprecated in ES 7

* Update tests to match instrumentation
  • Loading branch information
remram44 authored Oct 12, 2021
1 parent 4046fdb commit 1b75672
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 20 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
([679](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/679))
- `opentelemetry-exporter-richconsole` Initial release
([#686](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/686))
- `opentelemetry-instrumentation-elasticsearch` no longer creates unique span names by including document IDs, replaces them with `:id` and puts the value in attribute `elasticsearch.id`
([#705](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/705))
- `opentelemetry-instrumentation-tornado` now sets `http.client_ip` and `tornado.handler` attributes
([#706](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/706))
- `opentelemetry-instrumentation-requests` added exclude urls functionality
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def response_hook(span, response):
es.get(index='my-index', doc_type='my-type', id=1)
"""

import re
from logging import getLogger
from os import environ
from typing import Collection
Expand All @@ -107,7 +108,7 @@ def response_hook(span, response):
"took",
]

_DEFALT_OP_NAME = "request"
_DEFAULT_OP_NAME = "request"


class ElasticsearchInstrumentor(BaseInstrumentor):
Expand Down Expand Up @@ -146,6 +147,9 @@ def _uninstrument(self, **kwargs):
unwrap(elasticsearch.Transport, "perform_request")


_regex_doc_url = re.compile(r"/_doc/([^/]+)")


def _wrap_perform_request(
tracer, span_name_prefix, request_hook=None, response_hook=None
):
Expand All @@ -161,7 +165,24 @@ def wrapper(wrapped, _, args, kwargs):
len(args),
)

op_name = span_name_prefix + (url or method or _DEFALT_OP_NAME)
op_name = span_name_prefix + (url or method or _DEFAULT_OP_NAME)
doc_id = None
if url:
# TODO: This regex-based solution avoids creating an unbounded number of span names, but should be replaced by instrumenting individual Elasticsearch methods instead of Transport.perform_request()
# A limitation of the regex is that only the '_doc' mapping type is supported. Mapping types are deprecated since Elasticsearch 7
# https://github.com/open-telemetry/opentelemetry-python-contrib/issues/708
match = _regex_doc_url.search(url)
if match is not None:
# Remove the full document ID from the URL
doc_span = match.span()
op_name = (
span_name_prefix
+ url[: doc_span[0]]
+ "/_doc/:id"
+ url[doc_span[1] :]
)
# Put the document ID in attributes
doc_id = match.group(1)
params = kwargs.get("params", {})
body = kwargs.get("body", None)

Expand All @@ -184,6 +205,8 @@ def wrapper(wrapped, _, args, kwargs):
attributes[SpanAttributes.DB_STATEMENT] = str(body)
if params:
attributes["elasticsearch.params"] = str(params)
if doc_id:
attributes["elasticsearch.id"] = doc_id
for key, value in attributes.items():
span.set_attribute(key, value)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def test_instrumentor(self, request_mock):
request_mock.return_value = (1, {}, {})

es = Elasticsearch()
es.index(index="sw", doc_type="people", id=1, body={"name": "adam"})
es.index(index="sw", doc_type="_doc", id=1, body={"name": "adam"})

spans_list = self.get_finished_spans()
self.assertEqual(len(spans_list), 1)
Expand All @@ -79,7 +79,7 @@ def test_instrumentor(self, request_mock):
# check that no spans are generated after uninstrument
ElasticsearchInstrumentor().uninstrument()

es.index(index="sw", doc_type="people", id=1, body={"name": "adam"})
es.index(index="sw", doc_type="_doc", id=1, body={"name": "adam"})

spans_list = self.get_finished_spans()
self.assertEqual(len(spans_list), 1)
Expand Down Expand Up @@ -119,7 +119,7 @@ def test_prefix_env(self, request_mock):

def _test_prefix(self, prefix):
es = Elasticsearch()
es.index(index="sw", doc_type="people", id=1, body={"name": "adam"})
es.index(index="sw", doc_type="_doc", id=1, body={"name": "adam"})

spans_list = self.get_finished_spans()
self.assertEqual(len(spans_list), 1)
Expand All @@ -133,11 +133,12 @@ def test_result_values(self, request_mock):
'{"found": false, "timed_out": true, "took": 7}',
)
es = Elasticsearch()
es.get(index="test-index", doc_type="tweet", id=1)
es.get(index="test-index", doc_type="_doc", id=1)

spans = self.get_finished_spans()

self.assertEqual(1, len(spans))
self.assertEqual(spans[0].name, "Elasticsearch/test-index/_doc/:id")
self.assertEqual("False", spans[0].attributes["elasticsearch.found"])
self.assertEqual(
"True", spans[0].attributes["elasticsearch.timed_out"]
Expand All @@ -159,7 +160,7 @@ def test_trace_error_not_found(self, request_mock):
def _test_trace_error(self, code, exc):
es = Elasticsearch()
try:
es.get(index="test-index", doc_type="tweet", id=1)
es.get(index="test-index", doc_type="_doc", id=1)
except Exception: # pylint: disable=broad-except
pass

Expand All @@ -176,15 +177,13 @@ def test_parent(self, request_mock):
request_mock.return_value = (1, {}, {})
es = Elasticsearch()
with self.tracer.start_as_current_span("parent"):
es.index(
index="sw", doc_type="people", id=1, body={"name": "adam"}
)
es.index(index="sw", doc_type="_doc", id=1, body={"name": "adam"})

spans = self.get_finished_spans()
self.assertEqual(len(spans), 2)

parent = spans.by_name("parent")
child = spans.by_name("Elasticsearch/sw/people/1")
child = spans.by_name("Elasticsearch/sw/_doc/:id")
self.assertIsNotNone(child.parent)
self.assertEqual(child.parent.span_id, parent.context.span_id)

Expand All @@ -198,13 +197,13 @@ def test_multithread(self, request_mock):
# 3. Check the spans got different parents, and are in the expected order.
def target1(parent_span):
with trace.use_span(parent_span):
es.get(index="test-index", doc_type="tweet", id=1)
es.get(index="test-index", doc_type="_doc", id=1)
ev.set()
ev.wait()

def target2():
ev.wait()
es.get(index="test-index", doc_type="tweet", id=2)
es.get(index="test-index", doc_type="_doc", id=2)
ev.set()

with self.tracer.start_as_current_span("parent") as span:
Expand All @@ -220,8 +219,8 @@ def target2():
self.assertEqual(3, len(spans))

s1 = spans.by_name("parent")
s2 = spans.by_name("Elasticsearch/test-index/tweet/1")
s3 = spans.by_name("Elasticsearch/test-index/tweet/2")
s2 = spans.by_attr("elasticsearch.id", "1")
s3 = spans.by_attr("elasticsearch.id", "2")

self.assertIsNotNone(s2.parent)
self.assertEqual(s2.parent.span_id, s1.context.span_id)
Expand Down Expand Up @@ -343,10 +342,9 @@ def request_hook(span, method, url, kwargs):
)
es = Elasticsearch()
index = "test-index"
doc_type = "tweet"
doc_id = 1
kwargs = {"params": {"test": True}}
es.get(index=index, doc_type=doc_type, id=doc_id, **kwargs)
es.get(index=index, doc_type="_doc", id=doc_id, **kwargs)

spans = self.get_finished_spans()

Expand All @@ -355,7 +353,7 @@ def request_hook(span, method, url, kwargs):
"GET", spans[0].attributes[request_hook_method_attribute]
)
self.assertEqual(
f"/{index}/{doc_type}/{doc_id}",
f"/{index}/_doc/{doc_id}",
spans[0].attributes[request_hook_url_attribute],
)
self.assertEqual(
Expand Down Expand Up @@ -390,7 +388,7 @@ def response_hook(span, response):
"hits": [
{
"_index": "test-index",
"_type": "tweet",
"_type": "_doc",
"_id": "1",
"_score": 0.18232156,
"_source": {"name": "tester"},
Expand All @@ -405,7 +403,7 @@ def response_hook(span, response):
json.dumps(response_payload),
)
es = Elasticsearch()
es.get(index="test-index", doc_type="tweet", id=1)
es.get(index="test-index", doc_type="_doc", id=1)

spans = self.get_finished_spans()

Expand Down

0 comments on commit 1b75672

Please sign in to comment.