feat: add support for Splunk backend (#335)

* Backend: Add Splunk as a backend provider Signed-off-by: Maxime Véroone <[email protected]> * WIP reveiw --------- Signed-off-by: Maxime Véroone <[email protected]>
google · Jun 21, 2023 · 5171318 · 5171318
1 parent 5542a06
commit 5171318
Show file tree

Hide file tree

Showing 14 changed files with 359 additions and 5 deletions.
diff --git a/.gitignore b/.gitignore
@@ -49,3 +49,4 @@ reports/
 .mypy_cache
 .pytest_cache/
 .pytype/
+.pip-wheel-metadata/
diff --git a/Dockerfile b/Dockerfile
@@ -22,6 +22,6 @@ RUN apt-get update && \
 ADD . /app
 WORKDIR /app
 RUN pip install -U setuptools
-RUN pip install ."[api, datadog, dynatrace, prometheus, elasticsearch, pubsub, cloud_monitoring, cloud_service_monitoring, cloud_storage, bigquery, cloudevent, dev]"
+RUN pip install ."[api, datadog, dynatrace, prometheus, elasticsearch, splunk, pubsub, cloud_monitoring, cloud_service_monitoring, cloud_storage, bigquery, cloudevent, dev]"
 ENTRYPOINT [ "slo-generator" ]
 CMD ["-v"]
diff --git a/Makefile b/Makefile
@@ -58,7 +58,7 @@ develop: install
 	pre-commit install
 
 install: clean
-	$(PIP) install -e ."[api, datadog, prometheus, elasticsearch, pubsub, cloud_monitoring, bigquery, dev]"
+	$(PIP) install -e ."[api, datadog, prometheus, elasticsearch, splunk, pubsub, cloud_monitoring, bigquery, dev]"
 
 uninstall: clean
 	$(PIP) freeze --exclude-editable | xargs $(PIP) uninstall -y
@@ -102,7 +102,7 @@ bandit:
 safety:
 	safety check
 
-integration: int_cm int_csm int_custom int_dd int_dt int_es int_prom
+integration: int_cm int_csm int_custom int_dd int_dt int_es int_prom int_sp
 
 int_cm:
 	slo-generator compute -f samples/cloud_monitoring -c samples/config.yaml
@@ -125,6 +125,9 @@ int_es:
 int_prom:
 	slo-generator compute -f samples/prometheus -c samples/config.yaml
 
+int_sp:
+	slo-generator compute -f samples/splunk -c samples/config.yaml
+
 # Run API locally
 run_api:
 	slo-generator api --target=run_compute --signature-type=http -c samples/config.yaml

diff --git a/docs/providers/splunk.md b/docs/providers/splunk.md
@@ -0,0 +1,71 @@
+# Splunk
+
+## Backend
+
+Using the `splunk` backend class, you can query any metrics available in Splunk Enterprise to create an SLO.
+
+```yaml
+backends:
+  splunk:
+    host: ${SPLUNK_HOST}
+    port: ${SPLUNK_PORT}
+    user: ${SPLUNK_USER}
+    password: ${SPLUNK_PWD}
+    token: ${SPLUNK_BEARER_TOKEN}
+```
+You need either a user/password pair or a token, not both.
+
+The following methods are available to compute SLOs with the `splunk` backend:
+
+* `good_bad_ratio`, using `search_query_good` & `search_query_bad`/`search_query_valid`, for computing good / bad metrics ratios.
+* `query_sli`, using `search_query`, for computing SLIs directly with Splunk.
+
+The `good_bad_ratio` method is used to compute the ratio between two metrics:
+
+* **Good events**, i.e. events we consider as 'good' from the user perspective.
+* **Bad or valid events**, i.e. events we consider either as 'bad' from the user perspective, or all events we consider as 'valid' for the computation of the SLO. Note: if both are specified, the 'bad' configuration takes precedence over 'valid'.
+
+This method is often used for availability SLOs, but can be used for other purposes as well (see examples).
+
+**Config example:**
+
+```yaml
+backend: splunk
+method:  good_bad_ratio
+service_level_indicator:
+    search_query_good: search index=access_logs host=web* status=200 | stats count(status) as good | table good
+    search_query_bad: search index=access_logs host=web* status!=200 status!=403 | stats count(status) as bad | table bad
+```
+
+**&rightarrow; [Full SLO config](../../samples/splunk/slo_splunk_app_availability_ratio.yaml)**
+
+### Query SLI
+
+The `query_sli` method is used to directly query the needed SLI with Splunk using the arithmetic capabilities of the search language.
+
+This method makes it more flexible to input any `splunk` SLI computation and eventually reduces the number of queries made to Splunk.
+
+```yaml
+backend: splunk
+method: query_sli
+service_level_indicator:
+    search_query: search index=access_logs host=web* status!=403 | stats count(eval(status="200")) as good count as valid | eval sli=round(good/valid,3)
+```
+
+**&rightarrow; [Full SLO config](../../samples/splunk/slo_splunk_app_availability_query_sli.yaml)**
+
+### Examples
+
+Complete SLO samples using `splunk` are available in [samples/splunk](../../samples/splunk). Check them out!
+
+## Exporter
+
+Not implemented as of yet.
+
+## Splunk search performance
+
+Note that running oneshot queries on splunk may not always be fast. Depending on the resources of your splunk infrastructure, volume of data and SLO window, it can take up to minutes. It can even be so long that the "oneshot" method of the SDK we're using times out. In this case there are several alternatives:
+
+1. Switch the code to the "normal" search mode instead, which asynchronously polls the splunk search head for results instead of waiting for the REST response.
+2. Make use of pre-cooked "saved searches" and just trigger the jobs on demand. This would require the backend code to be reworked to switch from oneshot searches to saved searches.
+3. Alternatively, it's also possible to have saved searches already executed by Splunk on a schedule and just query their results. Here too, this would require a rework/update of the code.
diff --git a/samples/config.yaml b/samples/config.yaml
@@ -19,6 +19,11 @@ backends:
     url: ${ELASTICSEARCH_URL}
   prometheus:
     url: ${PROMETHEUS_URL}
+  splunk:
+    host: ${SPLUNK_HOST}
+    port: ${SPLUNK_PORT}
+    user: ${SPLUNK_USER}
+    password: ${SPLUNK_PWD}
 
 exporters:
   cloudevent:

diff --git a/samples/splunk/slo_splunk_app_availability_query_sli.yaml b/samples/splunk/slo_splunk_app_availability_query_sli.yaml
@@ -0,0 +1,16 @@
+apiVersion: sre.google.com/v2
+kind: ServiceLevelObjective
+metadata:
+    name: splunk-app-availability
+    labels:
+        service_name: splunk
+        feature_name: app
+        slo_name: availability
+spec:
+    description: 99% of app requests return a valid HTTP code
+    backend: splunk
+    method: query_sli
+    exporters: []
+    service_level_indicator:
+        search_query: search index=access_logs host=web* status!=403 | stats count(eval(status="200")) as good count as valid | eval sli=round(good/valid,3)
+    goal: 0.99
diff --git a/samples/splunk/slo_splunk_app_availability_ratio.yaml b/samples/splunk/slo_splunk_app_availability_ratio.yaml
@@ -0,0 +1,17 @@
+apiVersion: sre.google.com/v2
+kind: ServiceLevelObjective
+metadata:
+    name: splunk-app-availability
+    labels:
+        service_name: splunk
+        feature_name: app
+        slo_name: availability
+spec:
+    description: 99% of app requests return a valid HTTP code
+    backend: splunk
+    method: good_bad_ratio
+    exporters: []
+    service_level_indicator:
+        search_query_good: search index=access_logs host=web* status=200 | stats count(status) as good
+        search_query_bad: search index=access_logs host=web* status!=200 status!=403 | stats count(status) as bad
+    goal: 0.99
diff --git a/samples/splunk/slo_splunk_app_availability_ratio2.yaml b/samples/splunk/slo_splunk_app_availability_ratio2.yaml
@@ -0,0 +1,17 @@
+apiVersion: sre.google.com/v2
+kind: ServiceLevelObjective
+metadata:
+    name: splunk-app-availability
+    labels:
+        service_name: splunk
+        feature_name: app
+        slo_name: availability
+spec:
+    description: 99% of app requests return a valid HTTP code
+    backend: splunk
+    method: good_bad_ratio
+    exporters: []
+    service_level_indicator:
+        search_query_good: search index=access_logs host=web* status=200 | stats count(status) as good
+        search_query_valid: search index=access_logs host=web* status!=403 | stats count(status) as valid
+    goal: 0.99
diff --git a/setup.cfg b/setup.cfg
@@ -89,11 +89,13 @@ cloud_service_monitoring =
 cloud_storage =
     google-api-python-client <2
     google-cloud-storage
+elasticsearch =
+    elasticsearch
+splunk =
+    splunk-sdk
 pubsub =
     google-api-python-client <2
     google-cloud-pubsub <2
-elasticsearch =
-    elasticsearch
 cloudevent =
     cloudevents
 dev =

diff --git a/slo_generator/backends/splunk.py b/slo_generator/backends/splunk.py
@@ -0,0 +1,160 @@
+"""
+`splunk.py`
+Query a splunk search to compute a SLI as a custom slo-generator backend
+"""
+import copy
+import json
+import logging
+import re
+
+import splunklib.client as splunk_client
+
+LOGGER = logging.getLogger(__name__)
+
+
+class SplunkBackend:
+    """Backend that computes SLIs from the results of Splunk searches.
+
+    Searches are sent through the `jobs.oneshot` call of the Splunk client and
+    the first row of the JSON response is used as the result.
+
+    Args:
+        client (obj, optional): Existing Splunk client, reused as-is if provided.
+        splunk_config (dict): Connection settings used when no client is given:
+            `host`, `port` (defaults to 8089) and either `token` or
+            `user` / `password`. The token takes precedence when both are set.
+    """
+
+    def __init__(self, client=None, **splunk_config):
+        self.client = client
+        conf = copy.deepcopy(splunk_config)
+        host = conf.pop("host", None)
+        # A missing or empty port value falls back to 8089 instead of
+        # failing inside `int()`.
+        port = int(conf.pop("port", None) or 8089)
+        token = conf.pop("token", None)
+        user = conf.pop("user", None)
+        password = conf.pop("password", None)
+        if not self.client:
+            if token is not None:
+                # Create a Service instance and log in using a token
+                self.client = splunk_client.connect(
+                    host=host,
+                    port=port,
+                    splunkToken=token,
+                )
+            else:
+                # Create a Service instance and log in using user/pwd
+                self.client = splunk_client.connect(
+                    host=host,
+                    port=port,
+                    username=user,
+                    password=password,
+                )
+
+    @staticmethod
+    def _search_kwargs(timestamp, window):
+        """Build the time range and output parameters shared by all searches."""
+        # NOTE(review): `earliest_time` is relative while `latest_time` is the
+        # given timestamp; confirm this is the intended window when `timestamp`
+        # is not the current time.
+        return {
+            "earliest_time": f"-{window}s",
+            "latest_time": timestamp,
+            "output_mode": "json",
+        }
+
+    def good_bad_ratio(self, timestamp, window, slo_config):
+        """
+        Query SLI value from good and bad (or valid) queries.
+        If both search_query_bad & search_query_valid are supplied,
+           "bad" takes precedence over valid.
+        Args:
+            timestamp (int): UNIX timestamp.
+            window (int): Window (in seconds).
+            slo_config (dict): SLO configuration.
+              spec:
+                method: "good_bad_ratio"
+                service_level_indicator:
+                  search_query_good (str): the search query to look for good events,
+                                           must return a single row/column named "good"
+                  search_query_bad (str): the search query to look for bad events,
+                                           must return a single row/column named "bad"
+                  search_query_valid (str): the search query to look for valid events,
+                                           must return a single row/column named "valid"
+        Returns:
+            tuple: Good event count, Bad event count.
+        Raises:
+            ValueError: if neither `search_query_bad` nor `search_query_valid`
+                is configured.
+        """
+        sli_config = slo_config["spec"]["service_level_indicator"]
+        has_bad = "search_query_bad" in sli_config
+        # Fail early with an explicit message instead of hitting an unbound
+        # local variable after the "good" query has already been sent.
+        if not has_bad and "search_query_valid" not in sli_config:
+            raise ValueError(
+                "Splunk `good_bad_ratio` requires `search_query_bad` or "
+                "`search_query_valid` in `service_level_indicator`."
+            )
+
+        kwargs_search = self._search_kwargs(timestamp, window)
+        query_good = sli_config["search_query_good"]
+        result_good = int(self.splunk_query(query_good, "good", **kwargs_search))
+
+        if has_bad:
+            query_bad = sli_config["search_query_bad"]
+            result_bad = int(self.splunk_query(query_bad, "bad", **kwargs_search))
+        else:
+            # Bad events are the valid events that are not good.
+            query_valid = sli_config["search_query_valid"]
+            result_valid = int(
+                self.splunk_query(query_valid, "valid", **kwargs_search)
+            )
+            result_bad = result_valid - result_good
+
+        return (result_good, result_bad)
+
+    def query_sli(self, timestamp, window, slo_config):
+        """Query SLI value directly.
+        Args:
+            timestamp (int): UNIX timestamp.
+            window (int): Window (in seconds).
+            slo_config (dict): SLO configuration.
+              spec:
+                method: "query_sli"
+                service_level_indicator:
+                  search_query (str): the search query to run.
+                                      must return a single row
+                                      with at least a column named "sli"
+        Returns:
+            float: SLI value.
+        """
+        result = self.splunk_query(
+            slo_config["spec"]["service_level_indicator"]["search_query"],
+            "sli",
+            **self._search_kwargs(timestamp, window),
+        )
+        # `splunk_query` already returns the content of the "sli" column,
+        # so it only needs to be converted to a number.
+        return float(result)
+
+    @staticmethod
+    def fix_search_prefix(search=""):
+        """
+        Splunk API search queries must start with "search"
+        but people are used to the search bar of the UI which doesn't
+
+        Queries starting with a pipe (generating commands) are left untouched,
+        and leading whitespace is dropped before the check.
+
+        Args:
+            search(string): the search to execute
+        Returns:
+            The same string prefixed with "search " if needed
+        """
+        query = search.lstrip()
+        if not re.match(r"(?:search\s|\|)", query):
+            query = f"search {query}"
+        return query
+
+    def splunk_query(self, search="", result_column="", **kwargs_search):
+        """
+        Clean up and send the search query to Splunk
+        and return the content of the first row of the chosen column
+
+        Args:
+            search(string): the search string to run against Splunk
+            result_column(string): the column to look for in the results of the search
+            kwargs_search(item): search parameters
+                                 as described in the Splunk oneshot search API
+        Returns:
+            The value of the first row of the results for the selected column
+        Raises:
+            IndexError: if the search returned no result row.
+            KeyError: if the first row has no column named `result_column`.
+        """
+        search_query = self.fix_search_prefix(search)
+        response = self.client.jobs.oneshot(search_query, **kwargs_search)
+        rows = json.loads(str(response))["results"]
+        if not rows:
+            raise IndexError(f"Splunk search returned no results: {search_query}")
+        first_row = rows[0]
+        if result_column not in first_row:
+            raise KeyError(
+                f"Column '{result_column}' not found in the results of "
+                f"Splunk search: {search_query}"
+            )
+        return first_row[result_column]
diff --git a/tests/unit/backends/test_splunk.py b/tests/unit/backends/test_splunk.py
@@ -0,0 +1,28 @@
+# Copyright 2022 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#            http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# flake8: noqa
+
+import unittest
+
+from slo_generator.backends.splunk import SplunkBackend
+
+
+class TestSplunkBackend(unittest.TestCase):
+    """Unit tests for the helpers of the Splunk backend."""
+
+    def test_fix_search_prefix(self):
+        """The "search " prefix is added when missing and never duplicated."""
+        search = "index=* status=200"
+        fixed_search = "search " + search
+
+        self.assertEqual(SplunkBackend.fix_search_prefix(search), fixed_search)
+        self.assertEqual(SplunkBackend.fix_search_prefix(fixed_search), fixed_search)
diff --git a/tests/unit/fixtures/splunk_generic_response.json b/tests/unit/fixtures/splunk_generic_response.json
@@ -0,0 +1,13 @@
+{
+  "init_offset" : 0,
+  "messages" : [],
+  "preview" : false,
+  "results": [
+    {
+      "good":"13666",
+      "bad":"2",
+      "valid":"13668",
+      "sli":"99.99"
+    }
+  ]
+}