diff --git a/.gitignore b/.gitignore
index b7dd28ef..383b1975 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,3 +49,4 @@ reports/
 .mypy_cache
 .pytest_cache/
 .pytype/
+.pip-wheel-metadata/
diff --git a/Dockerfile b/Dockerfile
index d6669577..1e8bd955 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -22,6 +22,6 @@ RUN apt-get update && \
 ADD . /app
 WORKDIR /app
 RUN pip install -U setuptools
-RUN pip install ."[api, datadog, dynatrace, prometheus, elasticsearch, pubsub, cloud_monitoring, cloud_service_monitoring, cloud_storage, bigquery, cloudevent, dev]"
+RUN pip install ."[api, datadog, dynatrace, prometheus, elasticsearch, splunk, pubsub, cloud_monitoring, cloud_service_monitoring, cloud_storage, bigquery, cloudevent, dev]"
 ENTRYPOINT [ "slo-generator" ]
 CMD ["-v"]
diff --git a/Makefile b/Makefile
index 59e4178c..34ac94d1 100644
--- a/Makefile
+++ b/Makefile
@@ -58,7 +58,7 @@ develop: install
 	pre-commit install
 
 install: clean
-	$(PIP) install -e ."[api, datadog, prometheus, elasticsearch, pubsub, cloud_monitoring, bigquery, dev]"
+	$(PIP) install -e ."[api, datadog, prometheus, elasticsearch, splunk, pubsub, cloud_monitoring, bigquery, dev]"
 
 uninstall: clean
 	$(PIP) freeze --exclude-editable | xargs $(PIP) uninstall -y
@@ -102,7 +102,7 @@ bandit:
 safety:
 	safety check
 
-integration: int_cm int_csm int_custom int_dd int_dt int_es int_prom
+integration: int_cm int_csm int_custom int_dd int_dt int_es int_prom int_sp
 
 int_cm:
 	slo-generator compute -f samples/cloud_monitoring -c samples/config.yaml
@@ -125,6 +125,9 @@ int_es:
 int_prom:
 	slo-generator compute -f samples/prometheus -c samples/config.yaml
 
+int_sp:
+	slo-generator compute -f samples/splunk -c samples/config.yaml
+
 # Run API locally
 run_api:
 	slo-generator api --target=run_compute --signature-type=http -c samples/config.yaml
diff --git a/docs/providers/splunk.md b/docs/providers/splunk.md
new file mode 100644
index 00000000..40311bbf
--- /dev/null
+++ b/docs/providers/splunk.md
@@ -0,0 +1,71 @@
+# Splunk
+
+## Backend
+
+Using the `splunk` backend class, you can query any metrics available in Splunk Enterprise to create an SLO.
+
+```yaml
+backends:
+  splunk:
+    host: ${SPLUNK_HOST}
+    port: ${SPLUNK_PORT}
+    user: ${SPLUNK_USER}
+    password: ${SPLUNK_PWD}
+    token: ${SPLUNK_BEARER_TOKEN}
+```
+
+You need either a user/password pair or a token, not both.
+
+The following methods are available to compute SLOs with the `splunk` backend:
+
+* `good_bad_ratio` for computing good/bad metrics ratios, using `search_query_good` and `search_query_bad` or `search_query_valid`.
+* `query_sli` for computing SLIs directly with Splunk, using `search_query`.
+
+### Good / bad ratio
+
+The `good_bad_ratio` method is used to compute the ratio between two metrics:
+
+* **Good events**, i.e. events we consider as 'good' from the user perspective.
+* **Bad or valid events**, i.e. events we consider either as 'bad' from the user perspective, or as 'valid' for the computation of the SLO. Note: if both are specified, the 'bad' configuration takes precedence over 'valid'.
+
+This method is often used for availability SLOs, but can be used for other purposes as well (see examples).
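+
+Under the hood, the generator computes the SLI from the two counts returned by these queries: `SLI = good / (good + bad)` when `search_query_bad` is set, or `SLI = good / valid` when `search_query_valid` is set (the backend derives `bad = valid - good`). For example, 999 good events out of 1000 valid events give an SLI of 0.999.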
+
+**Config example:**
+
+```yaml
+backend: splunk
+method: good_bad_ratio
+service_level_indicator:
+  search_query_good: search index=access_logs host=web* status=200 | stats count(status) as good | table good
+  search_query_bad: search index=access_logs host=web* status!=200 status!=403 | stats count(status) as bad | table bad
+```
+
+**→ [Full SLO config](../../samples/splunk/slo_splunk_app_availability_ratio.yaml)**
+
+### Query SLI
+
+The `query_sli` method is used to directly query the needed SLI with Splunk, using the arithmetic capabilities of its search language.
+
+This method makes it more flexible to input any `splunk` SLI computation, and can reduce the number of queries made to Splunk.
+
+```yaml
+backend: splunk
+method: query_sli
+service_level_indicator:
+  search_query: search index=access_logs host=web* status!=200 status!=403 | stats count(status="200") as good count(status!="403") as valid | eval sli=round(good/valid,3)
+```
+
+**→ [Full SLO config](../../samples/splunk/slo_splunk_app_availability_query_sli.yaml)**
+
+### Examples
+
+Complete SLO samples using `splunk` are available in [samples/splunk](../../samples/splunk). Check them out!
+
+## Exporter
+
+Not implemented yet.
+
+## Splunk search performance
+
+Note that running oneshot searches on Splunk is not always fast. Depending on the resources of your Splunk infrastructure, the volume of data, and the SLO window, a search can take minutes, sometimes long enough for the "oneshot" method of the SDK we're using to time out. In that case, there are several alternatives:
+
+1. Switch the code to the "normal" search mode, which asynchronously polls the Splunk search head for results instead of waiting for the REST response (see the sketch below).
+2. Make use of pre-cooked "saved searches" and just trigger the jobs on demand. This would require reworking the backend code to switch from oneshot searches to saved searches.
+3. Alternatively, have Splunk run saved searches on a schedule and simply query their results. Like option 2, this would require reworking the code.
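+
+As a rough illustration of option 1, a normal-mode search with the Splunk Python SDK could look like the sketch below. The function and its polling interval are hypothetical, not part of the current backend:
+
+```python
+import json
+import time
+
+def normal_mode_search(service, search_query, **kwargs_search):
+    """Create a normal-mode search job and poll until it completes."""
+    # output_mode applies to results retrieval, not to job creation
+    kwargs_search.pop("output_mode", None)
+    job = service.jobs.create(search_query, **kwargs_search)
+    while not job.is_done():
+        time.sleep(1)  # polling interval: tune to your Splunk infrastructure
+    return json.loads(job.results(output_mode="json").read())["results"]
+```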
diff --git a/samples/config.yaml b/samples/config.yaml
index 5978cdad..deabe06a 100644
--- a/samples/config.yaml
+++ b/samples/config.yaml
@@ -19,6 +19,11 @@ backends:
     url: ${ELASTICSEARCH_URL}
   prometheus:
     url: ${PROMETHEUS_URL}
+  splunk:
+    host: ${SPLUNK_HOST}
+    port: ${SPLUNK_PORT}
+    user: ${SPLUNK_USER}
+    password: ${SPLUNK_PWD}
 
 exporters:
   cloudevent:
diff --git a/samples/splunk/slo_splunk_app_availability_query_sli.yaml b/samples/splunk/slo_splunk_app_availability_query_sli.yaml
new file mode 100644
index 00000000..28bf85b0
--- /dev/null
+++ b/samples/splunk/slo_splunk_app_availability_query_sli.yaml
@@ -0,0 +1,16 @@
+apiVersion: sre.google.com/v2
+kind: ServiceLevelObjective
+metadata:
+  name: splunk-app-availability
+  labels:
+    service_name: splunk
+    feature_name: app
+    slo_name: availability
+spec:
+  description: 99% of app requests return a valid HTTP code
+  backend: splunk
+  method: query_sli
+  exporters: []
+  service_level_indicator:
+    search_query: search index=access_logs host=web* status!=200 status!=403 | stats count(status="200") as good count(status!="403") as valid | eval sli=round(good/valid,3)
+  goal: 0.99
diff --git a/samples/splunk/slo_splunk_app_availability_ratio.yaml b/samples/splunk/slo_splunk_app_availability_ratio.yaml
new file mode 100644
index 00000000..d855afcb
--- /dev/null
+++ b/samples/splunk/slo_splunk_app_availability_ratio.yaml
@@ -0,0 +1,17 @@
+apiVersion: sre.google.com/v2
+kind: ServiceLevelObjective
+metadata:
+  name: splunk-app-availability
+  labels:
+    service_name: splunk
+    feature_name: app
+    slo_name: availability
+spec:
+  description: 99% of app requests return a valid HTTP code
+  backend: splunk
+  method: good_bad_ratio
+  exporters: []
+  service_level_indicator:
+    search_query_good: search index=access_logs host=web* status=200 | stats count(status) as good
+    search_query_bad: search index=access_logs host=web* status!=200 status!=403 | stats count(status) as bad
+  goal: 0.99
diff --git a/samples/splunk/slo_splunk_app_availability_ratio2.yaml b/samples/splunk/slo_splunk_app_availability_ratio2.yaml
new file mode 100644
index 00000000..37039be9
--- /dev/null
+++ b/samples/splunk/slo_splunk_app_availability_ratio2.yaml
@@ -0,0 +1,17 @@
+apiVersion: sre.google.com/v2
+kind: ServiceLevelObjective
+metadata:
+  name: splunk-app-availability
+  labels:
+    service_name: splunk
+    feature_name: app
+    slo_name: availability
+spec:
+  description: 99% of app requests return a valid HTTP code
+  backend: splunk
+  method: good_bad_ratio
+  exporters: []
+  service_level_indicator:
+    search_query_good: search index=access_logs host=web* status=200 | stats count(status) as good
+    search_query_valid: search index=access_logs host=web* status!=403 | stats count(status) as valid
+  goal: 0.99
diff --git a/setup.cfg b/setup.cfg
index bc3d5117..176a9268 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -89,11 +89,13 @@ cloud_service_monitoring =
 cloud_storage =
     google-api-python-client <2
    google-cloud-storage
+elasticsearch =
+    elasticsearch
+splunk =
+    splunk-sdk
 pubsub =
     google-api-python-client <2
     google-cloud-pubsub <2
-elasticsearch =
-    elasticsearch
 cloudevent =
     cloudevents
 dev =
diff --git a/slo_generator/backends/splunk.py b/slo_generator/backends/splunk.py
new file mode 100644
index 00000000..deb66183
--- /dev/null
+++ b/slo_generator/backends/splunk.py
@@ -0,0 +1,160 @@
+"""
+`splunk.py`
+Query a Splunk search to compute an SLI, as a custom slo-generator backend.
+"""
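+# Note: this backend relies on the `splunk-sdk` package, declared as the
+# `splunk` extra in setup.cfg (pip install "slo-generator[splunk]").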
+import copy
+import json
+import logging
+import re
+
+import splunklib.client as splunk_client
+
+LOGGER = logging.getLogger(__name__)
+
+
+class SplunkBackend:
+    """
+    Queries data from a Splunk instance (On-Premises or Cloud)
+    and makes SLOs out of them.
+    """
+
+    def __init__(self, client=None, **splunk_config):
+        self.client = client
+        conf = copy.deepcopy(splunk_config)
+        host = conf.pop("host", None)
+        port = int(conf.pop("port", 8089))
+        token = conf.pop("token", None)
+        user = conf.pop("user", None)
+        password = conf.pop("password", None)
+        if not self.client:
+            if token is not None:
+                # Create a Service instance and log in using a token
+                self.client = splunk_client.connect(
+                    host=host,
+                    port=port,
+                    splunkToken=token,
+                )
+            else:
+                # Create a Service instance and log in using user/pwd
+                self.client = splunk_client.connect(
+                    host=host,
+                    port=port,
+                    username=user,
+                    password=password,
+                )
+
+    def good_bad_ratio(self, timestamp, window, slo_config):
+        """
+        Query SLI value from good and valid queries.
+        If both search_query_bad & search_query_valid are supplied,
+        "bad" takes precedence over "valid".
+
+        Args:
+            timestamp (int): UNIX timestamp.
+            window (int): Window (in seconds).
+            slo_config (dict): SLO configuration.
+                spec:
+                    method: "good_bad_ratio"
+                    service_level_indicator:
+                        search_query_good (str): the search query to look for good events,
+                            must return a single row/column named "good"
+                        search_query_bad (str): the search query to look for bad events,
+                            must return a single row/column named "bad"
+                        search_query_valid (str): the search query to look for valid events,
+                            must return a single row/column named "valid"
+
+        Returns:
+            tuple: Good event count, Bad event count.
+        """
+        kwargs_search = {
+            "earliest_time": f"-{window}s",
+            "latest_time": timestamp,
+            "output_mode": "json",
+        }
+        result_good = int(
+            self.splunk_query(
+                slo_config["spec"]["service_level_indicator"]["search_query_good"],
+                "good",
+                **kwargs_search,
+            )
+        )
+        if "search_query_bad" in slo_config["spec"]["service_level_indicator"]:
+            result_bad = int(
+                self.splunk_query(
+                    slo_config["spec"]["service_level_indicator"]["search_query_bad"],
+                    "bad",
+                    **kwargs_search,
+                )
+            )
+        elif "search_query_valid" in slo_config["spec"]["service_level_indicator"]:
+            result_bad = (
+                int(
+                    self.splunk_query(
+                        slo_config["spec"]["service_level_indicator"][
+                            "search_query_valid"
+                        ],
+                        "valid",
+                        **kwargs_search,
+                    )
+                )
+                - result_good
+            )
+        else:
+            # Without one of the two queries we cannot derive a bad event count
+            raise ValueError(
+                "one of `search_query_bad` or `search_query_valid` is required"
+            )
+
+        return (result_good, result_bad)
+
+    def query_sli(self, timestamp, window, slo_config):
+        """Query SLI value directly.
+
+        Args:
+            timestamp (int): UNIX timestamp.
+            window (int): Window (in seconds).
+            slo_config (dict): SLO configuration.
+                spec:
+                    method: "query_sli"
+                    service_level_indicator:
+                        search_query (str): the search query to run,
+                            must return a single row
+                            with at least a column named "sli"
+
+        Returns:
+            float: SLI value.
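+
+        Example:
+            a query ending in `| eval sli=round(good/valid,3)` yields JSON
+            results like {"results": [{"sli": "0.997"}]}; the first row's
+            "sli" value is extracted and returned as a float.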
+        """
+        kwargs_search = {
+            "earliest_time": f"-{window}s",
+            "latest_time": timestamp,
+            "output_mode": "json",
+        }
+        result = self.splunk_query(
+            slo_config["spec"]["service_level_indicator"]["search_query"],
+            "sli",
+            **kwargs_search,
+        )
+        return float(result)
+
+    @staticmethod
+    def fix_search_prefix(search=""):
+        """
+        Splunk API search queries must start with "search",
+        but people are used to the UI search bar, which doesn't require it.
+
+        Args:
+            search (string): the search to execute
+        Returns:
+            The same string, prefixed with "search " if needed
+        """
+        if not re.search("^search ", search):
+            search = f"search {search}"
+        return search
+
+    def splunk_query(self, search="", result_column="", **kwargs_search):
+        """
+        Clean up the search query, send it to Splunk,
+        and return the content of the first result row for the chosen column.
+
+        Args:
+            search (string): the search string to run against Splunk
+            result_column (string): the column to look for in the search results
+            kwargs_search (item): search parameters,
+                as described in the Splunk oneshot search API
+        Returns:
+            The value of the first row of the results for the selected column
+        """
+        search_query = self.fix_search_prefix(search)
+        result_json = self.client.jobs.oneshot(search_query, **kwargs_search)
+        return json.loads(str(result_json))["results"][0][result_column]
diff --git a/tests/unit/backends/test_splunk.py b/tests/unit/backends/test_splunk.py
new file mode 100644
index 00000000..7f94441b
--- /dev/null
+++ b/tests/unit/backends/test_splunk.py
@@ -0,0 +1,28 @@
+# Copyright 2022 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
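+
+# Unit tests for the Splunk backend helpers. The end-to-end compute path is
+# exercised with mocks in tests/unit/test_compute.py (test_splunk_search).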
+
+# flake8: noqa
+
+import unittest
+
+from slo_generator.backends.splunk import SplunkBackend
+
+
+class TestSplunkBackend(unittest.TestCase):
+    def test_fix_search_prefix(self):
+        search = "index=* status=200"
+        fixed_search = "search " + search
+
+        assert SplunkBackend.fix_search_prefix(search) == fixed_search
+        assert SplunkBackend.fix_search_prefix(fixed_search) == fixed_search
diff --git a/tests/unit/fixtures/splunk_generic_response.json b/tests/unit/fixtures/splunk_generic_response.json
new file mode 100644
index 00000000..a8e0ef26
--- /dev/null
+++ b/tests/unit/fixtures/splunk_generic_response.json
@@ -0,0 +1,13 @@
+{
+  "init_offset": 0,
+  "messages": [],
+  "preview": false,
+  "results": [
+    {
+      "good": "13666",
+      "bad": "2",
+      "valid": "13668",
+      "sli": "99.99"
+    }
+  ]
+}
diff --git a/tests/unit/test_compute.py b/tests/unit/test_compute.py
index d17762ae..7a305f70 100644
--- a/tests/unit/test_compute.py
+++ b/tests/unit/test_compute.py
@@ -20,6 +20,8 @@
 from google.auth._default import _CLOUD_SDK_CREDENTIALS_WARNING
 from mock import MagicMock, patch
 from prometheus_http_client import Prometheus
+from splunklib import client as Splunk
+from splunklib.client import Jobs
 
 from slo_generator.backends.dynatrace import DynatraceClient
 from slo_generator.compute import compute, export
@@ -40,6 +42,7 @@
     mock_es,
     mock_prom,
     mock_sd,
+    mock_splunk_oneshot,
     mock_ssm_client,
 )
 
@@ -53,6 +56,7 @@
 SLO_CONFIGS_ES = load_slo_samples("elasticsearch", CTX)
 SLO_CONFIGS_DD = load_slo_samples("datadog", CTX)
 SLO_CONFIGS_DT = load_slo_samples("dynatrace", CTX)
+SLO_CONFIGS_SPLUNK = load_slo_samples("splunk", CTX)
 SLO_REPORT = load_fixture("slo_report_v2.json")
 SLO_REPORT_V1 = load_fixture("slo_report_v1.json")
 EXPORTERS = load_fixture("exporters.yaml", CTX)
@@ -75,6 +79,13 @@
 class TestCompute(unittest.TestCase):
     maxDiff = None
 
+    @patch.object(Jobs, "oneshot", side_effect=mock_splunk_oneshot)
+    @patch.object(Splunk, "connect", return_value=None)
+    def test_splunk_search(self, *mocks):
+        for config in SLO_CONFIGS_SPLUNK:
+            with self.subTest(config=config):
+                compute(config, CONFIG)
+
     @patch(
         "google.api_core.grpc_helpers.create_channel",
         return_value=mock_sd(2 * STEPS * len(SLO_CONFIGS_SD)),
diff --git a/tests/unit/test_stubs.py b/tests/unit/test_stubs.py
index 768ef192..8e4493f6 100644
--- a/tests/unit/test_stubs.py
+++ b/tests/unit/test_stubs.py
@@ -56,6 +56,10 @@
     "DYNATRACE_API_URL": "fake",
     "DYNATRACE_API_TOKEN": "fake",
     "DYNATRACE_SLO_ID": "fake",
+    "SPLUNK_HOST": "fake",
+    "SPLUNK_PORT": "8089",
+    "SPLUNK_USER": "fake",
+    "SPLUNK_PWD": "fake",
 }
 
 
@@ -259,6 +263,12 @@
 def mock_dt_errors(*args, **kwargs):
     return load_fixture("dt_error_rate.json")
 
 
+def mock_splunk_oneshot(search, **kwargs):
+    """Mock a Splunk oneshot search job response."""
+    return load_fixture("splunk_generic_response.json")
+
+
 class dotdict(dict):
     """dot.notation access to dictionary attributes"""