Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(backend/datadog): use datadog-api-client-python rather than data… #528

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,10 @@ prometheus =
datadog =
datadog
retrying==1.3.4
datadog_api_client
dynatrace =
requests
retrying==1.3.4
bigquery =
google-api-python-client
google-cloud-bigquery
Expand Down
150 changes: 96 additions & 54 deletions slo_generator/backends/datadog.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,42 +17,63 @@
"""

import logging
import os
import pprint

import datadog

from slo_generator import utils

LOGGER = logging.getLogger(__name__)
logging.getLogger("datadog.api").setLevel(logging.ERROR)
from datadog_api_client.v1 import ApiClient, ApiException, Configuration
from datadog_api_client.v1.api.authentication_api import AuthenticationApi
from datadog_api_client.v1.api.metrics_api import MetricsApi
from datadog_api_client.v1.api.service_level_objectives_api import (
ServiceLevelObjectivesApi,
)

# Configure logging
# NOTE(review): this runs at import time. The level is read from the LOGLEVEL
# environment variable (default "ERROR"). With `force=True`, `basicConfig`
# removes and closes any handlers already attached to the root logger, so
# importing this module replaces logging configuration done earlier by the
# host application -- confirm this is intended for a library module.
logging.basicConfig(level=os.environ.get("LOGLEVEL", "ERROR").upper(), force=True)
# Module-level logger named after this module.
# NOTE(review): the functions visible in this file log through the root
# `logging.*` helpers rather than through this object -- confirm which one is
# meant to be used.
logger = logging.getLogger(__name__)


class DatadogClient:
    """Bundle of the Datadog API clients used by the backend.

    Builds a single `ApiClient` from a `Configuration` with retries enabled,
    issues one `AuthenticationApi.validate()` call at construction time, and
    exposes the SLO and Metrics API objects built on top of that client.

    Args:
        api_key (str, optional): Datadog API key. When omitted, whatever the
            `Configuration` object already holds for `apiKeyAuth` is kept.
        app_key (str, optional): Datadog APP key. When omitted, whatever the
            `Configuration` object already holds for `appKeyAuth` is kept.
        api_host (str, optional): Value passed as `host` to `Configuration`.
        kwargs (dict): Extra keyword arguments forwarded to `Configuration`.
            They take precedence over the retry defaults set here.

    Attributes:
        api_client: The shared `ApiClient` instance.
        slo_api_client: `ServiceLevelObjectivesApi` bound to `api_client`.
        metrics_api_client: `MetricsApi` bound to `api_client`.
    """

    def __init__(self, api_key=None, app_key=None, api_host=None, **kwargs):
        # Retry defaults; merged with kwargs so that a caller passing e.g.
        # `max_retries` overrides the default instead of triggering a
        # "multiple values for keyword argument" TypeError.
        options = {
            "enable_retry": True,
            "retry_backoff_factor": 2,
            "max_retries": 5,
        }
        options.update(kwargs)
        configuration = Configuration(host=api_host, **options)

        # Only override credentials that were explicitly provided. The
        # library documents DD_API_KEY / DD_APP_KEY environment fallbacks;
        # assigning None here would overwrite any key picked up that way.
        if api_key is not None:
            configuration.api_key["apiKeyAuth"] = api_key
        if app_key is not None:
            configuration.api_key["appKeyAuth"] = app_key

        self.api_client = ApiClient(configuration)
        # Validate the API key up front, before any query is attempted.
        AuthenticationApi(self.api_client).validate()
        self.slo_api_client = ServiceLevelObjectivesApi(self.api_client)
        self.metrics_api_client = MetricsApi(self.api_client)


class DatadogBackend:
"""Backend for querying metrics from Datadog.

Args:
client (obj, optional): Existing Datadog client to pass.
api_key (str): Datadog API key.
app_key (str): Datadog APP key.
api_host (str): Datadog API host.
kwargs (dict): Extra arguments to pass to initialize function.
"""

def __init__(
    self, client=None, api_key=None, app_key=None, api_host=None, **kwargs
):
    """Keep the client passed in, or build a `DatadogClient` if there is none.

    Args:
        client (obj, optional): Existing client to reuse.
        api_key (str, optional): Forwarded to `DatadogClient`.
        app_key (str, optional): Forwarded to `DatadogClient`.
        api_host (str, optional): Forwarded to `DatadogClient`.
        kwargs (dict): Extra keyword arguments forwarded to `DatadogClient`.
    """
    # Any falsy `client` (not only None) results in a new DatadogClient.
    self.client = client or DatadogClient(
        api_key=api_key, app_key=app_key, api_host=api_host, **kwargs
    )

def good_bad_ratio(self, timestamp, window, slo_config):
"""Query SLI value from good and valid queries.

Args:
timestamp (int): UNIX timestamp.
window (int): Window (in seconds).
slo_config (dict): SLO configuration.

Returns:
tuple: Good event count, Bad event count.
"""
Expand All @@ -77,9 +98,9 @@ def good_bad_ratio(self, timestamp, window, slo_config):
operator_suffix,
)

good_event_query = self.client.Metric.query(
start=start,
end=end,
good_event_query = self.client.metrics_api_client.query_metrics(
_from=int(start),
to=int(end),
query=query_good,
)

Expand All @@ -90,9 +111,9 @@ def good_bad_ratio(self, timestamp, window, slo_config):
operator_suffix,
)

event_query = self.client.Metric.query(
start=start,
end=end,
event_query = self.client.metrics_api_client.query_metrics(
_from=int(start),
to=int(end),
query=query,
)

Expand All @@ -101,18 +122,18 @@ def good_bad_ratio(self, timestamp, window, slo_config):
if measurement.get("query_valid"):
event_count = event_count - good_event_count

LOGGER.debug(f"Good events: {good_event_count} | " f"Bad events: {event_count}")
logging.debug(
f"Good events: {good_event_count} | " f"Bad events: {event_count}"
)

return good_event_count, event_count

def query_sli(self, timestamp, window, slo_config):
"""Query SLI value directly.

Args:
timestamp (int): UNIX timestamp.
window (int): Window (in seconds).
slo_config (dict): SLO configuration.

Returns:
float: SLI value.
"""
Expand All @@ -121,59 +142,82 @@ def query_sli(self, timestamp, window, slo_config):
end = timestamp
query = measurement["query"]
query = self._fmt_query(query, window)
response = self.client.Metric.query(start=start, end=end, query=query)
LOGGER.debug(f"Result valid: {pprint.pformat(response)}")
response = self.client.metrics_api_client.query_metrics(
_from=int(start), to=int(end), query=query
)
logging.debug(f"Result valid: {pprint.pformat(response)}")
return DatadogBackend.count(response, average=True)

def query_slo(self, timestamp, window, slo_config):
    """Query SLO value from a given Datadog SLO.

    Fetches the SLO history for `[timestamp - window, timestamp]` and reads
    the result from the response's `data` section:

    * if `series.numerator.sum` and `series.denominator.sum` are both
      present, good and bad event counts are returned;
    * otherwise, if `overall.sli_value` is present (monitor-based SLI),
      that percentage divided by 100 is returned with no bad count.

    Args:
        timestamp (int): UNIX timestamp.
        window (int): Window (in seconds).
        slo_config (dict): SLO configuration.

    Returns:
        tuple: `(good_event_count, bad_event_count)` when event sums are
            present; `(sli_value, None)` when only an overall SLI value is
            present; `(None, None)` when the API call fails or the
            response holds neither.
    """
    slo_id = slo_config["spec"]["service_level_indicator"]["slo_id"]
    from_ts = timestamp - window

    try:
        # Retrieve the SLO history
        data = self.client.slo_api_client.get_slo_history(
            slo_id, from_ts=int(from_ts), to_ts=int(timestamp)
        )
    except ApiException as e:
        logging.error(f"Error retrieving SLO history: {e}")
        return None, None

    logging.debug(f"Timeseries data: {slo_id} | Result: {pprint.pformat(data)}")

    # Missing sections are treated as empty so lookups never raise; absence
    # is detected through None rather than through a KeyError (a chained
    # `.get(..., 0)` can never reach an `except KeyError` fallback).
    payload = data.get("data") or {}
    series = payload.get("series") or {}
    good_event_count = (series.get("numerator") or {}).get("sum")
    valid_event_count = (series.get("denominator") or {}).get("sum")
    if good_event_count is not None and valid_event_count is not None:
        return good_event_count, valid_event_count - good_event_count

    # Monitor-based SLI case: no event sums, only an overall percentage.
    # NOTE(review): the earlier implementation returned the bare float here
    # instead of a tuple -- confirm which shape the caller expects.
    sli_value = (payload.get("overall") or {}).get("sli_value")
    if sli_value is not None:
        return sli_value / 100, None

    # Neither event sums nor an overall SLI value were found.
    return None, None

@staticmethod
def _fmt_query(query, window, operator=None, operator_suffix=None):
"""Format Datadog query:

* If the Datadog expression has a `[window]` placeholder, replace it by
the current window. Otherwise, append it to the expression.

* If prefix / suffix operators are defined, apply them to the metric.

* If labels are defined, append them to existing labels.

Args:
query (str): Original query in YAML config.
window (int): Query window (in seconds).
operator (str): Operator (e.g: sum, avg, median, ...)
operator_suffix (str): Operator suffix (e.g: as_count(), ...)

Returns:
str: Formatted query.
"""
Expand All @@ -184,25 +228,23 @@ def _fmt_query(query, window, operator=None, operator_suffix=None):
query = query.replace("[window]", f"{window}")
if operator_suffix:
query = f"{query}.{operator_suffix}"
LOGGER.debug(f"Query: {query}")
logging.debug(f"Query: {query}")
return query

@staticmethod
def count(response, average=False):
"""Count events in time series.

Args:
response (dict): Datadog Metrics API response.
average (bool): Take average of result.

Returns:
int: Event count.
"""
try:
values = []
pointlist = response["series"][0]["pointlist"]
for point in pointlist:
value = point[1]
value = point["value"][1]
if value is None:
continue
values.append(value)
Expand All @@ -212,5 +254,5 @@ def count(response, average=False):
return sum(values) / len(values)
return sum(values)
except (IndexError, AttributeError) as exception:
LOGGER.debug(exception)
logging.debug(exception)
return 0 # no events in timeseries
Loading