Skip to content

Commit

Permalink
Add possibility to split oversized udp batches
Browse files Browse the repository at this point in the history
If we use the BatchExportSpanProcessor combined with the
JaegerSpanExporter and use instrumentations that add a lot of metadata
to the spans like sqlalchemy, then we run occationally into the "Data
exceeds the max UDP packet size" warning causing dropped spans and
incomplete data.

The option to reduce the general batch-size to a very small number (in
my case >30) may cause a performance issue as the worker thread of the
batch exporter gets very busy. Instead this change allows the user to
ask the exporter to split oversized batches when they get detected and
send the splits separately instead of dropping them. Depending on the
usecase this is a better option than reducing the batch-size to a very
small value because every now and then they contain a couple of large
spans.
  • Loading branch information
janLo committed Feb 26, 2021
1 parent 99128b3 commit 1e654bd
Show file tree
Hide file tree
Showing 3 changed files with 328 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased](https://github.com/open-telemetry/opentelemetry-python/compare/v0.18b0...HEAD)

### Added
- Add `udp_split_oversized_batches` support to jaeger exporter
([#1500](https://github.com/open-telemetry/opentelemetry-python/pull/1500))

## [0.18b0](https://github.com/open-telemetry/opentelemetry-python/releases/tag/v0.18b0) - 2021-02-16

### Added
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
# pylint: disable=protected-access

import logging
import socket
from os import environ
from typing import Optional

Expand Down Expand Up @@ -120,6 +121,7 @@ class JaegerSpanExporter(SpanExporter):
insecure: True if collector has no encryption or authentication
credentials: Credentials for server authentication.
transport_format: Transport format for exporting spans to collector.
udp_split_oversized_batches: Re-emit oversized batches in smaller chunks.
"""

def __init__(
Expand All @@ -133,6 +135,7 @@ def __init__(
insecure: Optional[bool] = None,
credentials: Optional[ChannelCredentials] = None,
transport_format: Optional[str] = None,
udp_split_oversized_batches: bool = None,
):
self.service_name = service_name
self.agent_host_name = _parameter_setter(
Expand All @@ -151,8 +154,15 @@ def __init__(
env_variable=environ_agent_port,
default=DEFAULT_AGENT_PORT,
)
self.udp_split_oversized_batches = _parameter_setter(
param=udp_split_oversized_batches,
env_variable=Configuration().OTEL_EXPORTER_JAEGER_AGENT_SPLIT_OVERSIZED_BATCHES,
default=False,
)
self._agent_client = AgentClientUDP(
host_name=self.agent_host_name, port=self.agent_port
host_name=self.agent_host_name,
port=self.agent_port,
udp_split_oversized_batches=self.udp_split_oversized_batches,
)
self.collector_endpoint = _parameter_setter(
param=collector_endpoint,
Expand Down Expand Up @@ -257,3 +267,284 @@ def _parameter_setter(param, env_variable, default):
res = param

return res


def _nsec_to_usec_round(nsec):
"""Round nanoseconds to microseconds"""
return (nsec + 500) // 10 ** 3


def _translate_to_jaeger(spans: Span):
"""Translate the spans to Jaeger format.
Args:
spans: Tuple of spans to convert
"""

jaeger_spans = []

for span in spans:
ctx = span.get_span_context()
trace_id = ctx.trace_id
span_id = ctx.span_id

start_time_us = _nsec_to_usec_round(span.start_time)
duration_us = _nsec_to_usec_round(span.end_time - span.start_time)

status = span.status

parent_id = span.parent.span_id if span.parent else 0

tags = _extract_tags(span.attributes)
tags.extend(_extract_tags(span.resource.attributes))

tags.extend(
[
_get_long_tag("status.code", status.status_code.value),
_get_string_tag("status.message", status.description),
_get_string_tag("span.kind", OTLP_JAEGER_SPAN_KIND[span.kind]),
]
)

if span.instrumentation_info is not None:
tags.extend(
[
_get_string_tag(
"otel.instrumentation_library.name",
span.instrumentation_info.name,
),
_get_string_tag(
"otel.instrumentation_library.version",
span.instrumentation_info.version,
),
]
)

# Ensure that if Status.Code is not OK, that we set the "error" tag on the Jaeger span.
if not status.is_ok:
tags.append(_get_bool_tag("error", True))

refs = _extract_refs_from_span(span)
logs = _extract_logs_from_span(span)

flags = int(ctx.trace_flags)

jaeger_span = jaeger.Span(
traceIdHigh=_get_trace_id_high(trace_id),
traceIdLow=_get_trace_id_low(trace_id),
# generated code expects i64
spanId=_convert_int_to_i64(span_id),
operationName=span.name,
startTime=start_time_us,
duration=duration_us,
tags=tags,
logs=logs,
references=refs,
flags=flags,
parentSpanId=_convert_int_to_i64(parent_id),
)

jaeger_spans.append(jaeger_span)

return jaeger_spans


def _extract_refs_from_span(span):
if not span.links:
return None

refs = []
for link in span.links:
trace_id = link.context.trace_id
span_id = link.context.span_id
refs.append(
jaeger.SpanRef(
refType=jaeger.SpanRefType.FOLLOWS_FROM,
traceIdHigh=_get_trace_id_high(trace_id),
traceIdLow=_get_trace_id_low(trace_id),
spanId=_convert_int_to_i64(span_id),
)
)
return refs


def _convert_int_to_i64(val):
"""Convert integer to signed int64 (i64)"""
if val > 0x7FFFFFFFFFFFFFFF:
val -= 0x10000000000000000
return val


def _get_trace_id_low(trace_id):
return _convert_int_to_i64(trace_id & 0xFFFFFFFFFFFFFFFF)


def _get_trace_id_high(trace_id):
return _convert_int_to_i64((trace_id >> 64) & 0xFFFFFFFFFFFFFFFF)


def _extract_logs_from_span(span):
if not span.events:
return None

logs = []

for event in span.events:
fields = _extract_tags(event.attributes)

fields.append(
jaeger.Tag(
key="message", vType=jaeger.TagType.STRING, vStr=event.name
)
)

event_timestamp_us = _nsec_to_usec_round(event.timestamp)
logs.append(
jaeger.Log(timestamp=int(event_timestamp_us), fields=fields)
)
return logs


def _extract_tags(attr):
if not attr:
return []
tags = []
for attribute_key, attribute_value in attr.items():
tag = _convert_attribute_to_tag(attribute_key, attribute_value)
if tag is None:
continue
tags.append(tag)
return tags


def _convert_attribute_to_tag(key, attr):
"""Convert the attributes to jaeger tags."""
if isinstance(attr, bool):
return jaeger.Tag(key=key, vBool=attr, vType=jaeger.TagType.BOOL)
if isinstance(attr, str):
return jaeger.Tag(key=key, vStr=attr, vType=jaeger.TagType.STRING)
if isinstance(attr, int):
return jaeger.Tag(key=key, vLong=attr, vType=jaeger.TagType.LONG)
if isinstance(attr, float):
return jaeger.Tag(key=key, vDouble=attr, vType=jaeger.TagType.DOUBLE)
if isinstance(attr, tuple):
return jaeger.Tag(key=key, vStr=str(attr), vType=jaeger.TagType.STRING)
logger.warning("Could not serialize attribute %s:%r to tag", key, attr)
return None


def _get_long_tag(key, val):
return jaeger.Tag(key=key, vLong=val, vType=jaeger.TagType.LONG)


def _get_string_tag(key, val):
return jaeger.Tag(key=key, vStr=val, vType=jaeger.TagType.STRING)


def _get_bool_tag(key, val):
return jaeger.Tag(key=key, vBool=val, vType=jaeger.TagType.BOOL)


class AgentClientUDP:
"""Implement a UDP client to agent.
Args:
host_name: The host name of the Jaeger server.
port: The port of the Jaeger server.
max_packet_size: Maximum size of UDP packet.
client: Class for creating new client objects for agencies.
split_oversized_batches: Re-emit oversized batches in smaller chunks.
"""

def __init__(
self,
host_name,
port,
max_packet_size=UDP_PACKET_MAX_LENGTH,
client=agent.Client,
split_oversized_batches=False,
):
self.address = (host_name, port)
self.max_packet_size = max_packet_size
self.buffer = TTransport.TMemoryBuffer()
self.client = client(
iprot=TCompactProtocol.TCompactProtocol(trans=self.buffer)
)
self.split_oversized_batches = split_oversized_batches

def emit(self, batch: jaeger.Batch):
"""
Args:
batch: Object to emit Jaeger spans.
"""

# pylint: disable=protected-access
self.client._seqid = 0
# truncate and reset the position of BytesIO object
self.buffer._buffer.truncate(0)
self.buffer._buffer.seek(0)
self.client.emitBatch(batch)
buff = self.buffer.getvalue()
if len(buff) > self.max_packet_size:
if self.split_oversized_batches and len(batch.spans) > 1:
packets = math.ceil(len(buff) / self.max_packet_size)
div = math.ceil(len(batch.spans) / packets)
for packet in range(packets):
start = packet * div
end = (packet + 1) * div
self.emit(
jaeger.Batch(
process=batch.process, spans=batch.spans[start:end]
)
)
else:
logger.warning(
"Data exceeds the max UDP packet size; size %r, max %r",
len(buff),
self.max_packet_size,
)
return

with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as udp_socket:
udp_socket.sendto(buff, self.address)


class Collector:
"""Submits collected spans to Thrift HTTP server.
Args:
thrift_url: URL of the Jaeger HTTP Thrift.
auth: Auth tuple that contains username and password for Basic Auth.
"""

def __init__(self, thrift_url="", auth=None):
self.thrift_url = thrift_url
self.auth = auth
self.http_transport = THttpClient.THttpClient(
uri_or_host=self.thrift_url
)
self.protocol = TBinaryProtocol.TBinaryProtocol(self.http_transport)

# set basic auth header
if auth is not None:
auth_header = "{}:{}".format(*auth)
decoded = base64.b64encode(auth_header.encode()).decode("ascii")
basic_auth = dict(Authorization="Basic {}".format(decoded))
self.http_transport.setCustomHeaders(basic_auth)

def submit(self, batch: jaeger.Batch):
"""Submits batches to Thrift HTTP Server through Binary Protocol.
Args:
batch: Object to emit Jaeger spans.
"""
batch.write(self.protocol)
self.http_transport.flush()
code = self.http_transport.code
msg = self.http_transport.message
if code >= 300 or code < 200:
logger.error(
"Traces cannot be uploaded; HTTP status code: %s, message: %s",
code,
msg,
)
Original file line number Diff line number Diff line change
Expand Up @@ -465,3 +465,35 @@ def test_agent_client(self):
)

agent_client.emit(batch)

def test_agent_client_split(self):
agent_client = jaeger_exporter.AgentClientUDP(
host_name="localhost",
port=6354,
max_packet_size=250,
split_oversized_batches=True,
)

small_batch = jaeger.Batch(
# pylint: disable=protected-access
spans=jaeger_exporter._translate_to_jaeger((self._test_span,)),
process=jaeger.Process(serviceName="xxx"),
)

with unittest.mock.patch(
"socket.socket.sendto", autospec=True
) as fake_sendto:
agent_client.emit(small_batch)
self.assertEqual(fake_sendto.call_count, 1)

large_batch = jaeger.Batch(
# pylint: disable=protected-access
spans=jaeger_exporter._translate_to_jaeger([self._test_span,] * 2),
process=jaeger.Process(serviceName="xxx"),
)

with unittest.mock.patch(
"socket.socket.sendto", autospec=True
) as fake_sendto:
agent_client.emit(large_batch)
self.assertEqual(fake_sendto.call_count, 2)

0 comments on commit 1e654bd

Please sign in to comment.