From e7e6575130c5ac7ad68ce854b037cbad4ddacadf Mon Sep 17 00:00:00 2001 From: Luke Meyer Date: Thu, 20 Jul 2017 23:39:47 -0400 Subject: [PATCH] openshift_checks: refactor logging checks Turn failure messages into exceptions that tests can look for without depending on text meant for humans. Turn logging_namespace property into a method. Get rid of _exec_oc and just use logging.exec_oc. --- .../openshift_checks/__init__.py | 29 ++- .../openshift_checks/logging/curator.py | 37 +--- .../openshift_checks/logging/elasticsearch.py | 178 +++++++-------- .../openshift_checks/logging/fluentd.py | 197 ++++++++--------- .../openshift_checks/logging/kibana.py | 209 +++++++++--------- .../openshift_checks/logging/logging.py | 54 +++-- .../logging/logging_index_time.py | 75 ++++--- .../test/curator_test.py | 45 ++-- .../test/elasticsearch_test.py | 139 +++++++----- .../test/fluentd_test.py | 63 +++--- .../test/kibana_test.py | 160 ++++++++------ .../test/logging_check_test.py | 49 ++-- .../test/logging_index_time_test.py | 82 +++---- 13 files changed, 685 insertions(+), 632 deletions(-) diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py index cbf7c3e5f44..07eebe6c5cb 100644 --- a/roles/openshift_health_checker/openshift_checks/__init__.py +++ b/roles/openshift_health_checker/openshift_checks/__init__.py @@ -13,8 +13,30 @@ class OpenShiftCheckException(Exception): - """Raised when a check cannot proceed.""" - pass + """Raised when a check encounters a failure condition.""" + + def __init__(self, name, msg=None): + # msg is for the message the user will see when this is raised. + # name is for test code to identify the error without looking at msg text. + if msg is None: # for parameter backward compatibility + msg = name + name = self.__class__.__name__ + self.name = name + super(OpenShiftCheckException, self).__init__(msg) + + +class OpenShiftCheckExceptionList(OpenShiftCheckException): + """A container for multiple logging errors that may be detected in one check.""" + def __init__(self, errors): + self.errors = errors + super(OpenShiftCheckExceptionList, self).__init__( + 'OpenShiftCheckExceptionList', + '\n'.join(str(msg) for msg in errors) + ) + + # make iterable + def __getitem__(self, index): + return self.errors[index] @six.add_metaclass(ABCMeta) @@ -34,7 +56,8 @@ def __init__(self, execute_module=None, task_vars=None, tmp=None): self._execute_module = execute_module self.task_vars = task_vars or {} self.tmp = tmp - # set True when the check makes a change to the host so it can be reported to the user: + + # set to True when the check changes the host, for accurate total "changed" count self.changed = False @abstractproperty diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py index c8bbdcbdad9..f56fdde46a7 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/curator.py +++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py @@ -1,6 +1,6 @@ """Check for an aggregated logging Curator deployment""" -from openshift_checks.logging.logging import LoggingCheck +from openshift_checks.logging.logging import OpenShiftCheckException, LoggingCheck class Curator(LoggingCheck): @@ -9,46 +9,33 @@ class Curator(LoggingCheck): name = "curator" tags = ["health", "logging"] - logging_namespace = None - def run(self): - self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") - curator_pods, error = super(Curator, self).get_pods_for_component( - self.logging_namespace, - "curator", - ) - if error: - return {"failed": True, "msg": error} - check_error = self.check_curator(curator_pods) - - if check_error: - msg = ("The following Curator deployment issue was found:" - "\n-------\n" - "{}".format(check_error)) - return {"failed": True, "msg": msg} - + curator_pods = self.get_pods_for_component("curator") + self.check_curator(curator_pods) # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "msg": 'No problems found with Curator deployment.'} + + return {} def check_curator(self, pods): """Check to see if curator is up and working. Returns: error string""" if not pods: - return ( + raise OpenShiftCheckException( + "MissingComponentPods", "There are no Curator pods for the logging stack,\n" "so nothing will prune Elasticsearch indexes.\n" "Is Curator correctly deployed?" ) - not_running = super(Curator, self).not_running_pods(pods) + not_running = self.not_running_pods(pods) if len(not_running) == len(pods): - return ( + raise OpenShiftCheckException( + "CuratorNotRunning", "The Curator pod is not currently in a running state,\n" "so Elasticsearch indexes may increase without bound." ) if len(pods) - len(not_running) > 1: - return ( + raise OpenShiftCheckException( + "TooManyCurators", "There is more than one Curator pod running. This should not normally happen.\n" "Although this doesn't cause any problems, you may want to investigate." ) - - return None diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py index 3215d96881e..f9d5d8d65b1 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py +++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py @@ -3,6 +3,7 @@ import json import re +from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList from openshift_checks.logging.logging import LoggingCheck @@ -12,174 +13,179 @@ class Elasticsearch(LoggingCheck): name = "elasticsearch" tags = ["health", "logging"] - logging_namespace = None - def run(self): - """Check various things and gather errors. Returns: result as hash""" - - self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") - es_pods, error = super(Elasticsearch, self).get_pods_for_component( - self.logging_namespace, - "es", - ) - if error: - return {"failed": True, "msg": error} - check_error = self.check_elasticsearch(es_pods) - - if check_error: - msg = ("The following Elasticsearch deployment issue was found:" - "\n-------\n" - "{}".format(check_error)) - return {"failed": True, "msg": msg} - + es_pods = self.get_pods_for_component("es") + self.check_elasticsearch(es_pods) # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "msg": 'No problems found with Elasticsearch deployment.'} - def _not_running_elasticsearch_pods(self, es_pods): - """Returns: list of pods that are not running, list of errors about non-running pods""" - not_running = super(Elasticsearch, self).not_running_pods(es_pods) - if not_running: - return not_running, [( - 'The following Elasticsearch pods are not running:\n' - '{pods}' - 'These pods will not aggregate logs from their nodes.' - ).format(pods=''.join( - " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) - for pod in not_running - ))] - return not_running, [] + return {} def check_elasticsearch(self, es_pods): - """Various checks for elasticsearch. Returns: error string""" - not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods) - running_pods = [pod for pod in es_pods if pod not in not_running_pods] + """Perform checks for Elasticsearch. Raises OpenShiftCheckExceptionList on any errors.""" + running_pods, errors = self.running_elasticsearch_pods(es_pods) pods_by_name = { pod['metadata']['name']: pod for pod in running_pods # Filter out pods that are not members of a DC if pod['metadata'].get('labels', {}).get('deploymentconfig') } if not pods_by_name: - return 'No logging Elasticsearch pods were found. Is logging deployed?' - error_msgs += self._check_elasticsearch_masters(pods_by_name) - error_msgs += self._check_elasticsearch_node_list(pods_by_name) - error_msgs += self._check_es_cluster_health(pods_by_name) - error_msgs += self._check_elasticsearch_diskspace(pods_by_name) - return '\n'.join(error_msgs) + # nothing running, cannot run the rest of the check + errors.append(OpenShiftCheckException( + 'NoRunningPods', + 'No logging Elasticsearch pods were found running, so no logs are being aggregated.' + )) + raise OpenShiftCheckExceptionList(errors) + + errors += self.check_elasticsearch_masters(pods_by_name) + errors += self.check_elasticsearch_node_list(pods_by_name) + errors += self.check_es_cluster_health(pods_by_name) + errors += self.check_elasticsearch_diskspace(pods_by_name) + if errors: + raise OpenShiftCheckExceptionList(errors) + + def running_elasticsearch_pods(self, es_pods): + """Returns: list of running pods, list of errors about non-running pods""" + not_running = self.not_running_pods(es_pods) + running_pods = [pod for pod in es_pods if pod not in not_running] + if not_running: + return running_pods, [OpenShiftCheckException( + 'PodNotRunning', + 'The following Elasticsearch pods are defined but not running:\n' + '{pods}'.format(pods=''.join( + " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) + for pod in not_running + )) + )] + return running_pods, [] @staticmethod def _build_es_curl_cmd(pod_name, url): base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'" return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url) - def _check_elasticsearch_masters(self, pods_by_name): - """Check that Elasticsearch masters are sane. Returns: list of error strings""" + def check_elasticsearch_masters(self, pods_by_name): + """Check that Elasticsearch masters are sane. Returns: list of errors""" es_master_names = set() - error_msgs = [] + errors = [] for pod_name in pods_by_name.keys(): # Compare what each ES node reports as master and compare for split brain get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master") - master_name_str = self._exec_oc(get_master_cmd, []) + master_name_str = self.exec_oc(get_master_cmd, []) master_names = (master_name_str or '').split(' ') if len(master_names) > 1: es_master_names.add(master_names[1]) else: - error_msgs.append( - 'No master? Elasticsearch {pod} returned bad string when asked master name:\n' + errors.append(OpenShiftCheckException( + 'NoMasterName', + 'Elasticsearch {pod} gave unexpected response when asked master name:\n' ' {response}'.format(pod=pod_name, response=master_name_str) - ) + )) if not es_master_names: - error_msgs.append('No logging Elasticsearch masters were found. Is logging deployed?') - return '\n'.join(error_msgs) + errors.append(OpenShiftCheckException( + 'NoMasterFound', + 'No logging Elasticsearch masters were found.' + )) + return errors if len(es_master_names) > 1: - error_msgs.append( + errors.append(OpenShiftCheckException( + 'SplitBrainMasters', 'Found multiple Elasticsearch masters according to the pods:\n' '{master_list}\n' 'This implies that the masters have "split brain" and are not correctly\n' 'replicating data for the logging cluster. Log loss is likely to occur.' .format(master_list='\n'.join(' ' + master for master in es_master_names)) - ) + )) - return error_msgs + return errors - def _check_elasticsearch_node_list(self, pods_by_name): - """Check that reported ES masters are accounted for by pods. Returns: list of error strings""" + def check_elasticsearch_node_list(self, pods_by_name): + """Check that reported ES masters are accounted for by pods. Returns: list of errors""" if not pods_by_name: - return ['No logging Elasticsearch masters were found. Is logging deployed?'] + return [OpenShiftCheckException( + 'MissingComponentPods', + 'No logging Elasticsearch pods were found.' + )] # get ES cluster nodes node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes') - cluster_node_data = self._exec_oc(node_cmd, []) + cluster_node_data = self.exec_oc(node_cmd, []) try: cluster_nodes = json.loads(cluster_node_data)['nodes'] except (ValueError, KeyError): - return [ + return [OpenShiftCheckException( + 'MissingNodeList', 'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' + cluster_node_data - ] + )] # Try to match all ES-reported node hosts to known pods. - error_msgs = [] + errors = [] for node in cluster_nodes.values(): # Note that with 1.4/3.4 the pod IP may be used as the master name if not any(node['host'] in (pod_name, pod['status'].get('podIP')) for pod_name, pod in pods_by_name.items()): - error_msgs.append( + errors.append(OpenShiftCheckException( + 'EsPodNodeMismatch', 'The Elasticsearch cluster reports a member node "{node}"\n' 'that does not correspond to any known ES pod.'.format(node=node['host']) - ) + )) - return error_msgs + return errors - def _check_es_cluster_health(self, pods_by_name): + def check_es_cluster_health(self, pods_by_name): """Exec into the elasticsearch pods and check the cluster health. Returns: list of errors""" - error_msgs = [] + errors = [] for pod_name in pods_by_name.keys(): cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true') - cluster_health_data = self._exec_oc(cluster_health_cmd, []) + cluster_health_data = self.exec_oc(cluster_health_cmd, []) try: health_res = json.loads(cluster_health_data) if not health_res or not health_res.get('status'): raise ValueError() except ValueError: - error_msgs.append( + errors.append(OpenShiftCheckException( + 'BadEsResponse', 'Could not retrieve cluster health status from logging ES pod "{pod}".\n' 'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data) - ) + )) continue if health_res['status'] not in ['green', 'yellow']: - error_msgs.append( + errors.append(OpenShiftCheckException( + 'EsClusterHealthRed', 'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name) - ) + )) - return error_msgs + return errors - def _check_elasticsearch_diskspace(self, pods_by_name): + def check_elasticsearch_diskspace(self, pods_by_name): """ Exec into an ES pod and query the diskspace on the persistent volume. Returns: list of errors """ - error_msgs = [] + errors = [] for pod_name in pods_by_name.keys(): df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name) - disk_output = self._exec_oc(df_cmd, []) + disk_output = self.exec_oc(df_cmd, []) lines = disk_output.splitlines() # expecting one header looking like 'IUse% Use%' and one body line body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$' if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]): - error_msgs.append( + errors.append(OpenShiftCheckException( + 'BadDfResponse', 'Could not retrieve storage usage from logging ES pod "{pod}".\n' 'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output) - ) + )) continue inode_pct, disk_pct = re.match(body_re, lines[1]).groups() inode_pct_thresh = self.get_var('openshift_check_efk_es_inode_pct', default='90') if int(inode_pct) >= int(inode_pct_thresh): - error_msgs.append( + errors.append(OpenShiftCheckException( + 'InodeUsageTooHigh', 'Inode percent usage on the storage volume for logging ES pod "{pod}"\n' ' is {pct}, greater than threshold {limit}.\n' ' Note: threshold can be specified in inventory with {param}'.format( @@ -187,10 +193,11 @@ def _check_elasticsearch_diskspace(self, pods_by_name): pct=str(inode_pct), limit=str(inode_pct_thresh), param='openshift_check_efk_es_inode_pct', - )) + ))) disk_pct_thresh = self.get_var('openshift_check_efk_es_storage_pct', default='80') if int(disk_pct) >= int(disk_pct_thresh): - error_msgs.append( + errors.append(OpenShiftCheckException( + 'DiskUsageTooHigh', 'Disk percent usage on the storage volume for logging ES pod "{pod}"\n' ' is {pct}, greater than threshold {limit}.\n' ' Note: threshold can be specified in inventory with {param}'.format( @@ -198,13 +205,6 @@ def _check_elasticsearch_diskspace(self, pods_by_name): pct=str(disk_pct), limit=str(disk_pct_thresh), param='openshift_check_efk_es_storage_pct', - )) - - return error_msgs + ))) - def _exec_oc(self, cmd_str, extra_args): - return super(Elasticsearch, self).exec_oc( - self.logging_namespace, - cmd_str, - extra_args, - ) + return errors diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py index bde33cc64f1..3b192a281f3 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py +++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py @@ -2,6 +2,8 @@ import json + +from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList from openshift_checks.logging.logging import LoggingCheck @@ -11,61 +13,97 @@ class Fluentd(LoggingCheck): name = "fluentd" tags = ["health", "logging"] - logging_namespace = None - def run(self): - """Check various things and gather errors. Returns: result as hash""" + """Check the Fluentd deployment and raise an error if any problems are found.""" + + fluentd_pods = self.get_pods_for_component("fluentd") + self.check_fluentd(fluentd_pods) + return {} + + def check_fluentd(self, pods): + """Verify fluentd is running everywhere. Raises OpenShiftCheckExceptionList if error(s) found.""" - self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") - fluentd_pods, error = super(Fluentd, self).get_pods_for_component( - self.logging_namespace, - "fluentd", + node_selector = self.get_var( + 'openshift_logging_fluentd_nodeselector', + default='logging-infra-fluentd=true' ) - if error: - return {"failed": True, "msg": error} - check_error = self.check_fluentd(fluentd_pods) - if check_error: - msg = ("The following Fluentd deployment issue was found:" - "\n-------\n" - "{}".format(check_error)) - return {"failed": True, "msg": msg} + nodes_by_name = self.get_nodes_by_name() + fluentd_nodes = self.filter_fluentd_labeled_nodes(nodes_by_name, node_selector) + + errors = [] + errors += self.check_node_labeling(nodes_by_name, fluentd_nodes, node_selector) + errors += self.check_nodes_have_fluentd(pods, fluentd_nodes) + errors += self.check_fluentd_pods_running(pods) + + # Make sure there are no extra fluentd pods + if len(pods) > len(fluentd_nodes): + errors.append(OpenShiftCheckException( + 'TooManyFluentdPods', + 'There are more Fluentd pods running than nodes labeled.\n' + 'This may not cause problems with logging but it likely indicates something wrong.' + )) + + if errors: + raise OpenShiftCheckExceptionList(errors) - # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "msg": 'No problems found with Fluentd deployment.'} + def get_nodes_by_name(self): + """Retrieve all the node definitions. Returns: dict(name: node)""" + nodes_json = self.exec_oc("get nodes -o json", []) + try: + nodes = json.loads(nodes_json) + except ValueError: # no valid json - should not happen + raise OpenShiftCheckException( + "BadOcNodeList", + "Could not obtain a list of nodes to validate fluentd.\n" + "Output from oc get:\n" + nodes_json + ) + if not nodes or not nodes.get('items'): # also should not happen + raise OpenShiftCheckException( + "NoNodesDefined", + "No nodes appear to be defined according to the API." + ) + return { + node['metadata']['name']: node + for node in nodes['items'] + } @staticmethod - def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector): - """Filter to all nodes with fluentd label. Returns dict(name: node), error string""" + def filter_fluentd_labeled_nodes(nodes_by_name, node_selector): + """Filter to all nodes with fluentd label. Returns dict(name: node)""" label, value = node_selector.split('=', 1) fluentd_nodes = { name: node for name, node in nodes_by_name.items() if node['metadata']['labels'].get(label) == value } if not fluentd_nodes: - return None, ( + raise OpenShiftCheckException( + 'NoNodesLabeled', 'There are no nodes with the fluentd label {label}.\n' - 'This means no logs will be aggregated from the nodes.' - ).format(label=node_selector) - return fluentd_nodes, None + 'This means no logs will be aggregated from the nodes.'.format(label=node_selector) + ) + return fluentd_nodes - def _check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector): - """Note if nodes are not labeled as expected. Returns: error string""" + def check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector): + """Note if nodes are not labeled as expected. Returns: error list""" intended_nodes = self.get_var('openshift_logging_fluentd_hosts', default=['--all']) if not intended_nodes or '--all' in intended_nodes: intended_nodes = nodes_by_name.keys() nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys()) if nodes_missing_labels: - return ( + return [OpenShiftCheckException( + 'NodesUnlabeled', 'The following nodes are supposed to be labeled with {label} but are not:\n' ' {nodes}\n' - 'Fluentd will not aggregate logs from these nodes.' - ).format(label=node_selector, nodes=', '.join(nodes_missing_labels)) - return None + 'Fluentd will not aggregate logs from these nodes.'.format( + label=node_selector, nodes=', '.join(nodes_missing_labels) + ))] + + return [] @staticmethod - def _check_nodes_have_fluentd(pods, fluentd_nodes): - """Make sure fluentd is on all the labeled nodes. Returns: error string""" + def check_nodes_have_fluentd(pods, fluentd_nodes): + """Make sure fluentd is on all the labeled nodes. Returns: error list""" unmatched_nodes = fluentd_nodes.copy() node_names_by_label = { node['metadata']['labels']['kubernetes.io/hostname']: name @@ -85,83 +123,32 @@ def _check_nodes_have_fluentd(pods, fluentd_nodes): ]: unmatched_nodes.pop(name, None) if unmatched_nodes: - return ( + return [OpenShiftCheckException( + 'MissingFluentdPod', 'The following nodes are supposed to have a Fluentd pod but do not:\n' - '{nodes}' - 'These nodes will not have their logs aggregated.' - ).format(nodes=''.join( - " {}\n".format(name) - for name in unmatched_nodes.keys() - )) - return None + ' {nodes}\n' + 'These nodes will not have their logs aggregated.'.format( + nodes='\n '.join(unmatched_nodes.keys()) + ))] + + return [] - def _check_fluentd_pods_running(self, pods): + def check_fluentd_pods_running(self, pods): """Make sure all fluentd pods are running. Returns: error string""" not_running = super(Fluentd, self).not_running_pods(pods) if not_running: - return ( + return [OpenShiftCheckException( + 'FluentdNotRunning', 'The following Fluentd pods are supposed to be running but are not:\n' - '{pods}' - 'These pods will not aggregate logs from their nodes.' - ).format(pods=''.join( - " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) - for pod in not_running - )) - return None - - def check_fluentd(self, pods): - """Verify fluentd is running everywhere. Returns: error string""" - - node_selector = self.get_var( - 'openshift_logging_fluentd_nodeselector', - default='logging-infra-fluentd=true' - ) - - nodes_by_name, error = self.get_nodes_by_name() - - if error: - return error - fluentd_nodes, error = self._filter_fluentd_labeled_nodes(nodes_by_name, node_selector) - if error: - return error - - error_msgs = [] - error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector) - if error: - error_msgs.append(error) - error = self._check_nodes_have_fluentd(pods, fluentd_nodes) - if error: - error_msgs.append(error) - error = self._check_fluentd_pods_running(pods) - if error: - error_msgs.append(error) - - # Make sure there are no extra fluentd pods - if len(pods) > len(fluentd_nodes): - error_msgs.append( - 'There are more Fluentd pods running than nodes labeled.\n' - 'This may not cause problems with logging but it likely indicates something wrong.' - ) - - return '\n'.join(error_msgs) - - def get_nodes_by_name(self): - """Retrieve all the node definitions. Returns: dict(name: node), error string""" - nodes_json = self._exec_oc("get nodes -o json", []) - try: - nodes = json.loads(nodes_json) - except ValueError: # no valid json - should not happen - return None, "Could not obtain a list of nodes to validate fluentd. Output from oc get:\n" + nodes_json - if not nodes or not nodes.get('items'): # also should not happen - return None, "No nodes appear to be defined according to the API." - return { - node['metadata']['name']: node - for node in nodes['items'] - }, None - - def _exec_oc(self, cmd_str, extra_args): - return super(Fluentd, self).exec_oc( - self.logging_namespace, - cmd_str, - extra_args, - ) + ' {pods}\n' + 'These pods will not aggregate logs from their nodes.'.format( + pods='\n'.join( + " {name} ({host})".format( + name=pod['metadata']['name'], + host=pod['spec'].get('host', 'None') + ) + for pod in not_running + ) + ))] + + return [] diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py index b00b0ed5e3a..dc9963a6d8b 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/kibana.py +++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py @@ -12,7 +12,7 @@ from urllib.error import HTTPError, URLError import urllib.request as urllib2 -from openshift_checks.logging.logging import LoggingCheck +from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException class Kibana(LoggingCheck): @@ -21,31 +21,15 @@ class Kibana(LoggingCheck): name = "kibana" tags = ["health", "logging"] - logging_namespace = None - def run(self): """Check various things and gather errors. Returns: result as hash""" - self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") - kibana_pods, error = super(Kibana, self).get_pods_for_component( - self.logging_namespace, - "kibana", - ) - if error: - return {"failed": True, "msg": error} - check_error = self.check_kibana(kibana_pods) - - if not check_error: - check_error = self._check_kibana_route() - - if check_error: - msg = ("The following Kibana deployment issue was found:" - "\n-------\n" - "{}".format(check_error)) - return {"failed": True, "msg": msg} - + kibana_pods = self.get_pods_for_component("kibana") + self.check_kibana(kibana_pods) + self.check_kibana_route() # TODO(lmeyer): run it all again for the ops cluster - return {"failed": False, "msg": 'No problems found with Kibana deployment.'} + + return {} def _verify_url_internal(self, url): """ @@ -68,7 +52,7 @@ def _verify_url_internal(self, url): def _verify_url_external(url): """ Try to reach a URL from ansible control host. - Returns: success (bool), reason (for failure) + Raise an OpenShiftCheckException if anything goes wrong. """ # This actually checks from the ansible control host, which may or may not # really be "external" to the cluster. @@ -94,133 +78,148 @@ def _verify_url_external(url): return None def check_kibana(self, pods): - """Check to see if Kibana is up and working. Returns: error string.""" + """Check to see if Kibana is up and working. Raises OpenShiftCheckException if not.""" if not pods: - return "There are no Kibana pods deployed, so no access to the logging UI." + raise OpenShiftCheckException( + "MissingComponentPods", + "There are no Kibana pods deployed, so no access to the logging UI." + ) not_running = self.not_running_pods(pods) if len(not_running) == len(pods): - return "No Kibana pod is in a running state, so there is no access to the logging UI." + raise OpenShiftCheckException( + "NoRunningPods", + "No Kibana pod is in a running state, so there is no access to the logging UI." + ) elif not_running: - return ( + raise OpenShiftCheckException( + "PodNotRunning", "The following Kibana pods are not currently in a running state:\n" - "{pods}" - "However at least one is, so service may not be impacted." - ).format(pods="".join(" " + pod['metadata']['name'] + "\n" for pod in not_running)) - - return None + " {pods}\n" + "However at least one is, so service may not be impacted.".format( + pods="\n ".join(pod['metadata']['name'] for pod in not_running) + ) + ) def _get_kibana_url(self): """ Get kibana route or report error. - Returns: url (or empty), reason for failure + Returns: url """ # Get logging url - get_route = self._exec_oc("get route logging-kibana -o json", []) + get_route = self.exec_oc("get route logging-kibana -o json", []) if not get_route: - return None, 'no_route_exists' + raise OpenShiftCheckException( + 'no_route_exists', + 'No route is defined for Kibana in the logging namespace,\n' + 'so the logging stack is not accessible. Is logging deployed?\n' + 'Did something remove the logging-kibana route?' + ) - route = json.loads(get_route) + try: + route = json.loads(get_route) + # check that the route has been accepted by a router + ingress = route["status"]["ingress"] + except (ValueError, KeyError): + raise OpenShiftCheckException( + 'get_route_failed', + '"oc get route" returned an unexpected response:\n' + get_route + ) - # check that the route has been accepted by a router - ingress = route["status"]["ingress"] # ingress can be null if there is no router, or empty if not routed if not ingress or not ingress[0]: - return None, 'route_not_accepted' + raise OpenShiftCheckException( + 'route_not_accepted', + 'The logging-kibana route is not being routed by any router.\n' + 'Is the router deployed and working?' + ) host = route.get("spec", {}).get("host") if not host: - return None, 'route_missing_host' + raise OpenShiftCheckException( + 'route_missing_host', + 'The logging-kibana route has no hostname defined,\n' + 'which should never happen. Did something alter its definition?' + ) - return 'https://{}/'.format(host), None + return 'https://{}/'.format(host) - def _check_kibana_route(self): + def check_kibana_route(self): """ Check to see if kibana route is up and working. - Returns: error string + Raises exception if not. """ - known_errors = dict( - no_route_exists=( - 'No route is defined for Kibana in the logging namespace,\n' - 'so the logging stack is not accessible. Is logging deployed?\n' - 'Did something remove the logging-kibana route?' - ), - route_not_accepted=( - 'The logging-kibana route is not being routed by any router.\n' - 'Is the router deployed and working?' - ), - route_missing_host=( - 'The logging-kibana route has no hostname defined,\n' - 'which should never happen. Did something alter its definition?' - ), - ) - kibana_url, error = self._get_kibana_url() - if not kibana_url: - return known_errors.get(error, error) + kibana_url = self._get_kibana_url() # first, check that kibana is reachable from the master. error = self._verify_url_internal(kibana_url) if error: if 'urlopen error [Errno 111] Connection refused' in error: - error = ( + raise OpenShiftCheckException( + 'FailedToConnectInternal', 'Failed to connect from this master to Kibana URL {url}\n' - 'Is kibana running, and is at least one router routing to it?' - ).format(url=kibana_url) + 'Is kibana running, and is at least one router routing to it?'.format(url=kibana_url) + ) elif 'urlopen error [Errno -2] Name or service not known' in error: - error = ( + raise OpenShiftCheckException( + 'FailedToResolveInternal', 'Failed to connect from this master to Kibana URL {url}\n' 'because the hostname does not resolve.\n' - 'Is DNS configured for the Kibana hostname?' - ).format(url=kibana_url) + 'Is DNS configured for the Kibana hostname?'.format(url=kibana_url) + ) elif 'Status code was not' in error: - error = ( + raise OpenShiftCheckException( + 'WrongReturnCodeInternal', 'A request from this master to the Kibana URL {url}\n' 'did not return the correct status code (302).\n' 'This could mean that Kibana is malfunctioning, the hostname is\n' 'resolving incorrectly, or other network issues. The output was:\n' - ' {error}' - ).format(url=kibana_url, error=error) - return 'Error validating the logging Kibana route:\n' + error + ' {error}'.format(url=kibana_url, error=error) + ) + raise OpenShiftCheckException( + 'MiscRouteErrorInternal', + 'Error validating the logging Kibana route internally:\n' + error + ) # in production we would like the kibana route to work from outside the # cluster too; but that may not be the case, so allow disabling just this part. - if not self.get_var("openshift_check_efk_kibana_external", default=True): - return None + if self.get_var("openshift_check_efk_kibana_external", default="True").lower() != "true": + return error = self._verify_url_external(kibana_url) - if error: - if 'urlopen error [Errno 111] Connection refused' in error: - error = ( - 'Failed to connect from the Ansible control host to Kibana URL {url}\n' - 'Is the router for the Kibana hostname exposed externally?' - ).format(url=kibana_url) - elif 'urlopen error [Errno -2] Name or service not known' in error: - error = ( - 'Failed to resolve the Kibana hostname in {url}\n' - 'from the Ansible control host.\n' - 'Is DNS configured to resolve this Kibana hostname externally?' - ).format(url=kibana_url) - elif 'Expected success (200)' in error: - error = ( - 'A request to Kibana at {url}\n' - 'returned the wrong error code:\n' - ' {error}\n' - 'This could mean that Kibana is malfunctioning, the hostname is\n' - 'resolving incorrectly, or other network issues.' - ).format(url=kibana_url, error=error) - error = ( - 'Error validating the logging Kibana route:\n{error}\n' - 'To disable external Kibana route validation, set in your inventory:\n' - ' openshift_check_efk_kibana_external=False' - ).format(error=error) - return error - return None + if not error: + return - def _exec_oc(self, cmd_str, extra_args): - return super(Kibana, self).exec_oc( - self.logging_namespace, - cmd_str, - extra_args, + error_fmt = ( + 'Error validating the logging Kibana route:\n{error}\n' + 'To disable external Kibana route validation, set the variable:\n' + ' openshift_check_efk_kibana_external=False' + ) + if 'urlopen error [Errno 111] Connection refused' in error: + msg = ( + 'Failed to connect from the Ansible control host to Kibana URL {url}\n' + 'Is the router for the Kibana hostname exposed externally?' + ).format(url=kibana_url) + raise OpenShiftCheckException('FailedToConnect', error_fmt.format(error=msg)) + elif 'urlopen error [Errno -2] Name or service not known' in error: + msg = ( + 'Failed to resolve the Kibana hostname in {url}\n' + 'from the Ansible control host.\n' + 'Is DNS configured to resolve this Kibana hostname externally?' + ).format(url=kibana_url) + raise OpenShiftCheckException('FailedToResolve', error_fmt.format(error=msg)) + elif 'Expected success (200)' in error: + msg = ( + 'A request to Kibana at {url}\n' + 'returned the wrong error code:\n' + ' {error}\n' + 'This could mean that Kibana is malfunctioning, the hostname is\n' + 'resolving incorrectly, or other network issues.' + ).format(url=kibana_url, error=error) + raise OpenShiftCheckException('WrongReturnCode', error_fmt.format(error=msg)) + raise OpenShiftCheckException( + 'MiscRouteError', + 'Error validating the logging Kibana route externally:\n' + error ) diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py index 43ba6c406ff..3b7c39760f2 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/logging.py +++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py @@ -8,6 +8,16 @@ from openshift_checks import OpenShiftCheck, OpenShiftCheckException +class MissingComponentPods(OpenShiftCheckException): + """Raised when a component has no pods in the namespace.""" + pass + + +class CouldNotUseOc(OpenShiftCheckException): + """Raised when ocutil has a failure running oc.""" + pass + + class LoggingCheck(OpenShiftCheck): """Base class for OpenShift aggregated logging component checks""" @@ -15,7 +25,6 @@ class LoggingCheck(OpenShiftCheck): # run by itself. name = "logging" - logging_namespace = "logging" def is_active(self): logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False) @@ -32,22 +41,24 @@ def is_first_master(self): def run(self): return {} - def get_pods_for_component(self, namespace, logging_component): - """Get all pods for a given component. Returns: list of pods for component, error string""" + def get_pods_for_component(self, logging_component): + """Get all pods for a given component. Returns: list of pods.""" pod_output = self.exec_oc( - namespace, "get pods -l component={} -o json".format(logging_component), [], ) try: - pods = json.loads(pod_output) - if not pods or not pods.get('items'): + pods = json.loads(pod_output) # raises ValueError if deserialize fails + if not pods or not pods.get('items'): # also a broken response, treat the same raise ValueError() except ValueError: - # successful run but non-parsing data generally means there were no pods in the namespace - return None, 'No pods were found for the "{}" logging component.'.format(logging_component) + # successful run but non-parsing data generally means there were no pods to be found + raise MissingComponentPods( + 'There are no "{}" component pods in the "{}" namespace.\n' + 'Is logging deployed?'.format(logging_component, self.logging_namespace()) + ) - return pods['items'], None + return pods['items'] @staticmethod def not_running_pods(pods): @@ -63,15 +74,19 @@ def not_running_pods(pods): ) ] - def exec_oc(self, namespace="logging", cmd_str="", extra_args=None): + def logging_namespace(self): + """Returns the namespace in which logging is configured to deploy.""" + return self.get_var("openshift_logging_namespace", default="logging") + + def exec_oc(self, cmd_str="", extra_args=None): """ Execute an 'oc' command in the remote host. Returns: output of command and namespace, - or raises OpenShiftCheckException on error + or raises CouldNotUseOc on error """ config_base = self.get_var("openshift", "common", "config_base") args = { - "namespace": namespace, + "namespace": self.logging_namespace(), "config_file": os.path.join(config_base, "master", "admin.kubeconfig"), "cmd": cmd_str, "extra_args": list(extra_args) if extra_args else [], @@ -79,17 +94,16 @@ def exec_oc(self, namespace="logging", cmd_str="", extra_args=None): result = self.execute_module("ocutil", args) if result.get("failed"): - msg = ( - 'Unexpected error using `oc` to validate the logging stack components.\n' - 'Error executing `oc {cmd}`:\n' - '{error}' - ).format(cmd=args['cmd'], error=result['result']) - if result['result'] == '[Errno 2] No such file or directory': - msg = ( + raise CouldNotUseOc( "This host is supposed to be a master but does not have the `oc` command where expected.\n" "Has an installation been run on this host yet?" ) - raise OpenShiftCheckException(msg) + + raise CouldNotUseOc( + 'Unexpected error using `oc` to validate the logging stack components.\n' + 'Error executing `oc {cmd}`:\n' + '{error}'.format(cmd=args['cmd'], error=result['result']) + ) return result.get("result", "") diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py index b24e88e056c..d781db64979 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py +++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py @@ -19,8 +19,6 @@ class LoggingIndexTime(LoggingCheck): name = "logging_index_time" tags = ["health", "logging"] - logging_namespace = "logging" - def run(self): """Add log entry by making unique request to Kibana. Check for unique entry in the ElasticSearch pod logs.""" try: @@ -28,29 +26,25 @@ def run(self): self.get_var("openshift_check_logging_index_timeout_seconds", default=ES_CMD_TIMEOUT_SECONDS) ) except ValueError: - return { - "failed": True, - "msg": ('Invalid value provided for "openshift_check_logging_index_timeout_seconds". ' - 'Value must be an integer representing an amount in seconds.'), - } + raise OpenShiftCheckException( + 'InvalidTimeout', + 'Invalid value provided for "openshift_check_logging_index_timeout_seconds". ' + 'Value must be an integer representing an amount in seconds.' + ) running_component_pods = dict() # get all component pods - self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace) for component, name in (['kibana', 'Kibana'], ['es', 'Elasticsearch']): - pods, error = self.get_pods_for_component(self.logging_namespace, component) - - if error: - msg = 'Unable to retrieve pods for the {} logging component: {}' - return {"failed": True, "changed": False, "msg": msg.format(name, error)} - + pods = self.get_pods_for_component(component) running_pods = self.running_pods(pods) if not running_pods: - msg = ('No {} pods in the "Running" state were found.' - 'At least one pod is required in order to perform this check.') - return {"failed": True, "changed": False, "msg": msg.format(name)} + raise OpenShiftCheckException( + component + 'NoRunningPods', + 'No {} pods in the "Running" state were found.' + 'At least one pod is required in order to perform this check.'.format(name) + ) running_component_pods[component] = running_pods @@ -65,8 +59,11 @@ def wait_until_cmd_or_err(self, es_pod, uuid, timeout_secs): interval = 1 while not self.query_es_from_es(es_pod, uuid): if time.time() + interval > deadline: - msg = "expecting match in Elasticsearch for message with uuid {}, but no matches were found after {}s." - raise OpenShiftCheckException(msg.format(uuid, timeout_secs)) + raise OpenShiftCheckException( + "NoMatchFound", + "expecting match in Elasticsearch for message with uuid {}, " + "but no matches were found after {}s.".format(uuid, timeout_secs) + ) time.sleep(interval) def curl_kibana_with_uuid(self, kibana_pod): @@ -76,22 +73,23 @@ def curl_kibana_with_uuid(self, kibana_pod): exec_cmd = "exec {pod_name} -c kibana -- curl --max-time 30 -s http://localhost:5601/{uuid}" exec_cmd = exec_cmd.format(pod_name=pod_name, uuid=uuid) - error_str = self.exec_oc(self.logging_namespace, exec_cmd, []) + error_str = self.exec_oc(exec_cmd, []) try: error_code = json.loads(error_str)["statusCode"] - except KeyError: - msg = ('invalid response returned from Kibana request (Missing "statusCode" key):\n' - 'Command: {}\nResponse: {}').format(exec_cmd, error_str) - raise OpenShiftCheckException(msg) - except ValueError: - msg = ('invalid response returned from Kibana request (Non-JSON output):\n' - 'Command: {}\nResponse: {}').format(exec_cmd, error_str) - raise OpenShiftCheckException(msg) + except (KeyError, ValueError): + raise OpenShiftCheckException( + 'kibanaInvalidResponse', + 'invalid response returned from Kibana request:\n' + 'Command: {}\nResponse: {}'.format(exec_cmd, error_str) + ) if error_code != 404: - msg = 'invalid error code returned from Kibana request. Expecting error code "404", but got "{}" instead.' - raise OpenShiftCheckException(msg.format(error_code)) + raise OpenShiftCheckException( + 'kibanaInvalidReturnCode', + 'invalid error code returned from Kibana request.\n' + 'Expecting error code "404", but got "{}" instead.'.format(error_code) + ) return uuid @@ -105,17 +103,18 @@ def query_es_from_es(self, es_pod, uuid): "--key /etc/elasticsearch/secret/admin-key " "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}" ) - exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace, uuid=uuid) - result = self.exec_oc(self.logging_namespace, exec_cmd, []) + exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace(), uuid=uuid) + result = self.exec_oc(exec_cmd, []) try: count = json.loads(result)["count"] - except KeyError: - msg = 'invalid response from Elasticsearch query:\n"{}"\nMissing "count" key:\n{}' - raise OpenShiftCheckException(msg.format(exec_cmd, result)) - except ValueError: - msg = 'invalid response from Elasticsearch query:\n"{}"\nNon-JSON output:\n{}' - raise OpenShiftCheckException(msg.format(exec_cmd, result)) + except (KeyError, ValueError): + raise OpenShiftCheckException( + 'esInvalidResponse', + 'Invalid response from Elasticsearch query:\n' + ' {}\n' + 'Response was:\n{}'.format(exec_cmd, result) + ) return count diff --git a/roles/openshift_health_checker/test/curator_test.py b/roles/openshift_health_checker/test/curator_test.py index ae108c96e0f..62c680b74eb 100644 --- a/roles/openshift_health_checker/test/curator_test.py +++ b/roles/openshift_health_checker/test/curator_test.py @@ -1,22 +1,6 @@ import pytest -from openshift_checks.logging.curator import Curator - - -def canned_curator(exec_oc=None): - """Create a Curator check object with canned exec_oc method""" - check = Curator("dummy") # fails if a module is actually invoked - if exec_oc: - check._exec_oc = exec_oc - return check - - -def assert_error(error, expect_error): - if expect_error: - assert error - assert expect_error in error - else: - assert not error +from openshift_checks.logging.curator import Curator, OpenShiftCheckException plain_curator_pod = { @@ -44,25 +28,30 @@ def assert_error(error, expect_error): } +def test_get_curator_pods(): + check = Curator() + check.get_pods_for_component = lambda *_: [plain_curator_pod] + result = check.run() + assert "failed" not in result or not result["failed"] + + @pytest.mark.parametrize('pods, expect_error', [ ( [], - "no Curator pods", - ), - ( - [plain_curator_pod], - None, + 'MissingComponentPods', ), ( [not_running_curator_pod], - "not currently in a running state", + 'CuratorNotRunning', ), ( [plain_curator_pod, plain_curator_pod], - "more than one Curator pod", + 'TooManyCurators', ), ]) -def test_get_curator_pods(pods, expect_error): - check = canned_curator() - error = check.check_curator(pods) - assert_error(error, expect_error) +def test_get_curator_pods_fail(pods, expect_error): + check = Curator() + check.get_pods_for_component = lambda *_: pods + with pytest.raises(OpenShiftCheckException) as excinfo: + check.run() + assert excinfo.value.name == expect_error diff --git a/roles/openshift_health_checker/test/elasticsearch_test.py b/roles/openshift_health_checker/test/elasticsearch_test.py index 9edfc17c746..09bacd9ac0b 100644 --- a/roles/openshift_health_checker/test/elasticsearch_test.py +++ b/roles/openshift_health_checker/test/elasticsearch_test.py @@ -1,25 +1,26 @@ import pytest import json -from openshift_checks.logging.elasticsearch import Elasticsearch +from openshift_checks.logging.elasticsearch import Elasticsearch, OpenShiftCheckExceptionList + task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin'))) def canned_elasticsearch(task_vars=None, exec_oc=None): - """Create an Elasticsearch check object with canned exec_oc method""" - check = Elasticsearch("dummy", task_vars or {}) # fails if a module is actually invoked + """Create an Elasticsearch check object with stubbed exec_oc method""" + check = Elasticsearch(None, task_vars or {}) if exec_oc: - check._exec_oc = exec_oc + check.exec_oc = exec_oc return check -def assert_error(error, expect_error): - if expect_error: - assert error - assert expect_error in error - else: - assert not error +def assert_error_in_list(expect_err, errorlist): + assert any(err.name == expect_err for err in errorlist), "{} in {}".format(str(expect_err), str(errorlist)) + + +def pods_by_name(pods): + return {pod['metadata']['name']: pod for pod in pods} plain_es_pod = { @@ -27,6 +28,7 @@ def assert_error(error, expect_error): "labels": {"component": "es", "deploymentconfig": "logging-es"}, "name": "logging-es", }, + "spec": {}, "status": { "conditions": [{"status": "True", "type": "Ready"}], "containerStatuses": [{"ready": True}], @@ -40,6 +42,7 @@ def assert_error(error, expect_error): "labels": {"component": "es", "deploymentconfig": "logging-es-2"}, "name": "logging-es-2", }, + "spec": {}, "status": { "conditions": [{"status": "True", "type": "Ready"}], "containerStatuses": [{"ready": True}], @@ -48,12 +51,28 @@ def assert_error(error, expect_error): "_test_master_name_str": "name logging-es-2", } +unready_es_pod = { + "metadata": { + "labels": {"component": "es", "deploymentconfig": "logging-es-3"}, + "name": "logging-es-3", + }, + "spec": {}, + "status": { + "conditions": [{"status": "False", "type": "Ready"}], + "containerStatuses": [{"ready": False}], + "podIP": "10.10.10.10", + }, + "_test_master_name_str": "BAD_NAME_RESPONSE", +} + def test_check_elasticsearch(): - assert 'No logging Elasticsearch pods' in canned_elasticsearch().check_elasticsearch([]) + with pytest.raises(OpenShiftCheckExceptionList) as excinfo: + canned_elasticsearch().check_elasticsearch([]) + assert_error_in_list('NoRunningPods', excinfo.value) # canned oc responses to match so all the checks pass - def _exec_oc(cmd, args): + def exec_oc(cmd, args): if '_cat/master' in cmd: return 'name logging-es' elif '/_nodes' in cmd: @@ -65,33 +84,41 @@ def _exec_oc(cmd, args): else: raise Exception(cmd) - assert not canned_elasticsearch({}, _exec_oc).check_elasticsearch([plain_es_pod]) + check = canned_elasticsearch({}, exec_oc) + check.get_pods_for_component = lambda *_: [plain_es_pod] + assert {} == check.run() -def pods_by_name(pods): - return {pod['metadata']['name']: pod for pod in pods} +def test_check_running_es_pods(): + pods, errors = Elasticsearch().running_elasticsearch_pods([plain_es_pod, unready_es_pod]) + assert plain_es_pod in pods + assert_error_in_list('PodNotRunning', errors) + + +def test_check_elasticsearch_masters(): + pods = [plain_es_pod] + check = canned_elasticsearch(task_vars_config_base, lambda *_: plain_es_pod['_test_master_name_str']) + assert not check.check_elasticsearch_masters(pods_by_name(pods)) @pytest.mark.parametrize('pods, expect_error', [ ( [], - 'No logging Elasticsearch masters', + 'NoMasterFound', ), ( - [plain_es_pod], - None, + [unready_es_pod], + 'NoMasterName', ), ( [plain_es_pod, split_es_pod], - 'Found multiple Elasticsearch masters', + 'SplitBrainMasters', ), ]) -def test_check_elasticsearch_masters(pods, expect_error): +def test_check_elasticsearch_masters_error(pods, expect_error): test_pods = list(pods) - check = canned_elasticsearch(task_vars_config_base, lambda cmd, args: test_pods.pop(0)['_test_master_name_str']) - - errors = check._check_elasticsearch_masters(pods_by_name(pods)) - assert_error(''.join(errors), expect_error) + check = canned_elasticsearch(task_vars_config_base, lambda *_: test_pods.pop(0)['_test_master_name_str']) + assert_error_in_list(expect_error, check.check_elasticsearch_masters(pods_by_name(pods))) es_node_list = { @@ -101,80 +128,76 @@ def test_check_elasticsearch_masters(pods, expect_error): }}} +def test_check_elasticsearch_node_list(): + check = canned_elasticsearch(task_vars_config_base, lambda *_: json.dumps(es_node_list)) + assert not check.check_elasticsearch_node_list(pods_by_name([plain_es_pod])) + + @pytest.mark.parametrize('pods, node_list, expect_error', [ ( [], {}, - 'No logging Elasticsearch masters', - ), - ( - [plain_es_pod], - es_node_list, - None, + 'MissingComponentPods', ), ( [plain_es_pod], {}, # empty list of nodes triggers KeyError - "Failed to query", + 'MissingNodeList', ), ( [split_es_pod], es_node_list, - 'does not correspond to any known ES pod', + 'EsPodNodeMismatch', ), ]) -def test_check_elasticsearch_node_list(pods, node_list, expect_error): +def test_check_elasticsearch_node_list_errors(pods, node_list, expect_error): check = canned_elasticsearch(task_vars_config_base, lambda cmd, args: json.dumps(node_list)) + assert_error_in_list(expect_error, check.check_elasticsearch_node_list(pods_by_name(pods))) - errors = check._check_elasticsearch_node_list(pods_by_name(pods)) - assert_error(''.join(errors), expect_error) + +def test_check_elasticsearch_cluster_health(): + test_health_data = [{"status": "green"}] + check = canned_elasticsearch(exec_oc=lambda *_: json.dumps(test_health_data.pop(0))) + assert not check.check_es_cluster_health(pods_by_name([plain_es_pod])) @pytest.mark.parametrize('pods, health_data, expect_error', [ - ( - [plain_es_pod], - [{"status": "green"}], - None, - ), ( [plain_es_pod], [{"no-status": "should bomb"}], - 'Could not retrieve cluster health status', + 'BadEsResponse', ), ( [plain_es_pod, split_es_pod], [{"status": "green"}, {"status": "red"}], - 'Elasticsearch cluster health status is RED', + 'EsClusterHealthRed', ), ]) -def test_check_elasticsearch_cluster_health(pods, health_data, expect_error): +def test_check_elasticsearch_cluster_health_errors(pods, health_data, expect_error): test_health_data = list(health_data) - check = canned_elasticsearch(task_vars_config_base, lambda cmd, args: json.dumps(test_health_data.pop(0))) + check = canned_elasticsearch(exec_oc=lambda *_: json.dumps(test_health_data.pop(0))) + assert_error_in_list(expect_error, check.check_es_cluster_health(pods_by_name(pods))) - errors = check._check_es_cluster_health(pods_by_name(pods)) - assert_error(''.join(errors), expect_error) + +def test_check_elasticsearch_diskspace(): + check = canned_elasticsearch(exec_oc=lambda *_: 'IUse% Use%\n 3% 4%\n') + assert not check.check_elasticsearch_diskspace(pods_by_name([plain_es_pod])) @pytest.mark.parametrize('disk_data, expect_error', [ ( 'df: /elasticsearch/persistent: No such file or directory\n', - 'Could not retrieve storage usage', - ), - ( - 'IUse% Use%\n 3% 4%\n', - None, + 'BadDfResponse', ), ( 'IUse% Use%\n 95% 40%\n', - 'Inode percent usage on the storage volume', + 'InodeUsageTooHigh', ), ( 'IUse% Use%\n 3% 94%\n', - 'Disk percent usage on the storage volume', + 'DiskUsageTooHigh', ), ]) -def test_check_elasticsearch_diskspace(disk_data, expect_error): - check = canned_elasticsearch(task_vars_config_base, lambda cmd, args: disk_data) - - errors = check._check_elasticsearch_diskspace(pods_by_name([plain_es_pod])) - assert_error(''.join(errors), expect_error) +def test_check_elasticsearch_diskspace_errors(disk_data, expect_error): + check = canned_elasticsearch(exec_oc=lambda *_: disk_data) + assert_error_in_list(expect_error, check.check_elasticsearch_diskspace(pods_by_name([plain_es_pod]))) diff --git a/roles/openshift_health_checker/test/fluentd_test.py b/roles/openshift_health_checker/test/fluentd_test.py index 9cee5786875..e7bf9818b0e 100644 --- a/roles/openshift_health_checker/test/fluentd_test.py +++ b/roles/openshift_health_checker/test/fluentd_test.py @@ -1,23 +1,11 @@ import pytest import json -from openshift_checks.logging.fluentd import Fluentd +from openshift_checks.logging.fluentd import Fluentd, OpenShiftCheckExceptionList, OpenShiftCheckException -def canned_fluentd(exec_oc=None): - """Create a Fluentd check object with canned exec_oc method""" - check = Fluentd("dummy") # fails if a module is actually invoked - if exec_oc: - check._exec_oc = exec_oc - return check - - -def assert_error(error, expect_error): - if expect_error: - assert error - assert expect_error in error - else: - assert not error +def assert_error_in_list(expect_err, errorlist): + assert any(err.name == expect_err for err in errorlist), "{} in {}".format(str(expect_err), str(errorlist)) fluentd_pod_node1 = { @@ -65,45 +53,60 @@ def assert_error(error, expect_error): } +def test_get_fluentd_pods(): + check = Fluentd() + check.exec_oc = lambda *_: json.dumps(dict(items=[fluentd_node1])) + check.get_pods_for_component = lambda *_: [fluentd_pod_node1] + assert not check.run() + + @pytest.mark.parametrize('pods, nodes, expect_error', [ ( [], [], - 'No nodes appear to be defined', + 'NoNodesDefined', ), ( [], [fluentd_node3_unlabeled], - 'There are no nodes with the fluentd label', + 'NoNodesLabeled', ), ( [], [fluentd_node1, fluentd_node3_unlabeled], - 'Fluentd will not aggregate logs from these nodes.', + 'NodesUnlabeled', ), ( [], [fluentd_node2], - "nodes are supposed to have a Fluentd pod but do not", + 'MissingFluentdPod', ), ( [fluentd_pod_node1, fluentd_pod_node1], [fluentd_node1], - 'more Fluentd pods running than nodes labeled', + 'TooManyFluentdPods', ), ( [fluentd_pod_node2_down], [fluentd_node2], - "Fluentd pods are supposed to be running", - ), - ( - [fluentd_pod_node1], - [fluentd_node1], - None, + 'FluentdNotRunning', ), ]) -def test_get_fluentd_pods(pods, nodes, expect_error): - check = canned_fluentd(exec_oc=lambda cmd, args: json.dumps(dict(items=nodes))) +def test_get_fluentd_pods_errors(pods, nodes, expect_error): + check = Fluentd() + check.exec_oc = lambda *_: json.dumps(dict(items=nodes)) + + with pytest.raises(OpenShiftCheckException) as excinfo: + check.check_fluentd(pods) + if isinstance(excinfo.value, OpenShiftCheckExceptionList): + assert_error_in_list(expect_error, excinfo.value) + else: + assert expect_error == excinfo.value.name + - error = check.check_fluentd(pods) - assert_error(error, expect_error) +def test_bad_oc_node_list(): + check = Fluentd() + check.exec_oc = lambda *_: "this isn't even json" + with pytest.raises(OpenShiftCheckException) as excinfo: + check.get_nodes_by_name() + assert 'BadOcNodeList' == excinfo.value.name diff --git a/roles/openshift_health_checker/test/kibana_test.py b/roles/openshift_health_checker/test/kibana_test.py index 3a880d3002f..04a5e89c481 100644 --- a/roles/openshift_health_checker/test/kibana_test.py +++ b/roles/openshift_health_checker/test/kibana_test.py @@ -8,23 +8,7 @@ from urllib.error import HTTPError, URLError import urllib.request as urllib2 -from openshift_checks.logging.kibana import Kibana - - -def canned_kibana(exec_oc=None): - """Create a Kibana check object with canned exec_oc method""" - check = Kibana() # fails if a module is actually invoked - if exec_oc: - check._exec_oc = exec_oc - return check - - -def assert_error(error, expect_error): - if expect_error: - assert error - assert expect_error in error - else: - assert not error +from openshift_checks.logging.kibana import Kibana, OpenShiftCheckException plain_kibana_pod = { @@ -49,39 +33,45 @@ def assert_error(error, expect_error): } +def test_check_kibana(): + # should run without exception: + Kibana().check_kibana([plain_kibana_pod]) + + @pytest.mark.parametrize('pods, expect_error', [ ( [], - "There are no Kibana pods deployed", - ), - ( - [plain_kibana_pod], - None, + "MissingComponentPods", ), ( [not_running_kibana_pod], - "No Kibana pod is in a running state", + "NoRunningPods", ), ( [plain_kibana_pod, not_running_kibana_pod], - "The following Kibana pods are not currently in a running state", + "PodNotRunning", ), ]) -def test_check_kibana(pods, expect_error): - check = canned_kibana() - error = check.check_kibana(pods) - assert_error(error, expect_error) +def test_check_kibana_error(pods, expect_error): + with pytest.raises(OpenShiftCheckException) as excinfo: + Kibana().check_kibana(pods) + assert expect_error == excinfo.value.name -@pytest.mark.parametrize('route, expect_url, expect_error', [ +@pytest.mark.parametrize('comment, route, expect_error', [ ( + "No route returned", None, - None, - 'no_route_exists', + "no_route_exists", ), - # test route with no ingress ( + "broken route response", + {"status": {}}, + "get_route_failed", + ), + ( + "route with no ingress", { "metadata": { "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, @@ -94,12 +84,11 @@ def test_check_kibana(pods, expect_error): "host": "hostname", } }, - None, - 'route_not_accepted', + "route_not_accepted", ), - # test route with no host ( + "route with no host", { "metadata": { "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, @@ -112,12 +101,21 @@ def test_check_kibana(pods, expect_error): }, "spec": {}, }, - None, - 'route_missing_host', + "route_missing_host", ), +]) +def test_get_kibana_url_error(comment, route, expect_error): + check = Kibana() + check.exec_oc = lambda *_: json.dumps(route) if route else "" - # test route that looks fine + with pytest.raises(OpenShiftCheckException) as excinfo: + check._get_kibana_url() + assert excinfo.value.name == expect_error + + +@pytest.mark.parametrize('comment, route, expect_url', [ ( + "test route that looks fine", { "metadata": { "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, @@ -133,61 +131,57 @@ def test_check_kibana(pods, expect_error): }, }, "https://hostname/", - None, ), ]) -def test_get_kibana_url(route, expect_url, expect_error): - check = canned_kibana(exec_oc=lambda cmd, args: json.dumps(route) if route else "") - - url, error = check._get_kibana_url() - if expect_url: - assert url == expect_url - else: - assert not url - if expect_error: - assert error == expect_error - else: - assert not error +def test_get_kibana_url(comment, route, expect_url): + check = Kibana() + check.exec_oc = lambda *_: json.dumps(route) + assert expect_url == check._get_kibana_url() @pytest.mark.parametrize('exec_result, expect', [ ( 'urlopen error [Errno 111] Connection refused', - 'at least one router routing to it?', + 'FailedToConnectInternal', ), ( 'urlopen error [Errno -2] Name or service not known', - 'DNS configured for the Kibana hostname?', + 'FailedToResolveInternal', ), ( 'Status code was not [302]: HTTP Error 500: Server error', - 'did not return the correct status code', + 'WrongReturnCodeInternal', ), ( 'bork bork bork', - 'bork bork bork', # should pass through + 'MiscRouteErrorInternal', ), ]) def test_verify_url_internal_failure(exec_result, expect): check = Kibana(execute_module=lambda *_: dict(failed=True, msg=exec_result)) - check._get_kibana_url = lambda: ('url', None) + check._get_kibana_url = lambda: 'url' - error = check._check_kibana_route() - assert_error(error, expect) + with pytest.raises(OpenShiftCheckException) as excinfo: + check.check_kibana_route() + assert expect == excinfo.value.name @pytest.mark.parametrize('lib_result, expect', [ ( - HTTPError('url', 500, "it broke", hdrs=None, fp=None), - 'it broke', + HTTPError('url', 500, 'it broke', hdrs=None, fp=None), + 'MiscRouteError', ), ( - URLError('it broke'), - 'it broke', + URLError('urlopen error [Errno 111] Connection refused'), + 'FailedToConnect', + ), + ( + URLError('urlopen error [Errno -2] Name or service not known'), + 'FailedToResolve', ), ( 302, - 'returned the wrong error code', + 'WrongReturnCode', ), ( 200, @@ -210,9 +204,41 @@ def urlopen(url, context): raise lib_result monkeypatch.setattr(urllib2, 'urlopen', urlopen) - check = canned_kibana() - check._get_kibana_url = lambda: ('url', None) + check = Kibana() + check._get_kibana_url = lambda: 'url' check._verify_url_internal = lambda url: None - error = check._check_kibana_route() - assert_error(error, expect) + if not expect: + check.check_kibana_route() + return + + with pytest.raises(OpenShiftCheckException) as excinfo: + check.check_kibana_route() + assert expect == excinfo.value.name + + +def test_verify_url_external_skip(): + check = Kibana(lambda *_: {}, dict(openshift_check_efk_kibana_external="false")) + check._get_kibana_url = lambda: 'url' + check.check_kibana_route() + + +# this is kind of silly but it adds coverage for the run() method... +def test_run(): + pods = ["foo"] + ran = dict(check_kibana=False, check_route=False) + + def check_kibana(pod_list): + ran["check_kibana"] = True + assert pod_list == pods + + def check_kibana_route(): + ran["check_route"] = True + + check = Kibana() + check.get_pods_for_component = lambda *_: pods + check.check_kibana = check_kibana + check.check_kibana_route = check_kibana_route + + check.run() + assert ran["check_kibana"] and ran["check_route"] diff --git a/roles/openshift_health_checker/test/logging_check_test.py b/roles/openshift_health_checker/test/logging_check_test.py index 6f1697ee659..1a1c190f628 100644 --- a/roles/openshift_health_checker/test/logging_check_test.py +++ b/roles/openshift_health_checker/test/logging_check_test.py @@ -1,18 +1,14 @@ import pytest import json -from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException +from openshift_checks.logging.logging import LoggingCheck, MissingComponentPods, CouldNotUseOc task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin'))) -logging_namespace = "logging" - - -def canned_loggingcheck(exec_oc=None): +def canned_loggingcheck(exec_oc=None, execute_module=None): """Create a LoggingCheck object with canned exec_oc method""" - check = LoggingCheck() # fails if a module is actually invoked - check.logging_namespace = 'logging' + check = LoggingCheck(execute_module) if exec_oc: check.exec_oc = exec_oc return check @@ -97,8 +93,8 @@ def execute_module(module_name, *_): check = LoggingCheck(execute_module, task_vars_config_base) - with pytest.raises(OpenShiftCheckException) as excinfo: - check.exec_oc(logging_namespace, 'get foo', []) + with pytest.raises(CouldNotUseOc) as excinfo: + check.exec_oc('get foo', []) assert expect in str(excinfo) @@ -124,25 +120,32 @@ def test_is_active(groups, logging_deployed, is_active): assert LoggingCheck(None, task_vars).is_active() == is_active -@pytest.mark.parametrize('pod_output, expect_pods, expect_error', [ +@pytest.mark.parametrize('pod_output, expect_pods', [ + ( + json.dumps({'items': [plain_es_pod]}), + [plain_es_pod], + ), +]) +def test_get_pods_for_component(pod_output, expect_pods): + check = canned_loggingcheck(lambda *_: pod_output) + pods = check.get_pods_for_component("es") + assert pods == expect_pods + + +@pytest.mark.parametrize('exec_oc_output, expect_error', [ ( 'No resources found.', - None, - 'No pods were found for the "es"', + MissingComponentPods, ), ( - json.dumps({'items': [plain_kibana_pod, plain_es_pod, plain_curator_pod, fluentd_pod_node1]}), - [plain_es_pod], - None, + '{"items": null}', + MissingComponentPods, ), ]) -def test_get_pods_for_component(pod_output, expect_pods, expect_error): - check = canned_loggingcheck(lambda namespace, cmd, args: pod_output) - pods, error = check.get_pods_for_component( - logging_namespace, - "es", - ) - assert_error(error, expect_error) +def test_get_pods_for_component_fail(exec_oc_output, expect_error): + check = canned_loggingcheck(lambda *_: exec_oc_output) + with pytest.raises(expect_error): + check.get_pods_for_component("es") @pytest.mark.parametrize('name, pods, expected_pods', [ @@ -159,7 +162,7 @@ def test_get_pods_for_component(pod_output, expect_pods, expect_error): ], ids=lambda argvals: argvals[0]) def test_get_not_running_pods_no_container_status(name, pods, expected_pods): - check = canned_loggingcheck(lambda exec_module, namespace, cmd, args, task_vars: '') + check = canned_loggingcheck(lambda *_: '') result = check.not_running_pods(pods) assert result == expected_pods diff --git a/roles/openshift_health_checker/test/logging_index_time_test.py b/roles/openshift_health_checker/test/logging_index_time_test.py index 178d7cd84eb..22566b29551 100644 --- a/roles/openshift_health_checker/test/logging_index_time_test.py +++ b/roles/openshift_health_checker/test/logging_index_time_test.py @@ -69,7 +69,29 @@ def test_check_running_pods(pods, expect_pods): assert pods == expect_pods -@pytest.mark.parametrize('name, json_response, uuid, timeout, extra_words', [ +def test_bad_config_param(): + with pytest.raises(OpenShiftCheckException) as error: + LoggingIndexTime(task_vars=dict(openshift_check_logging_index_timeout_seconds="foo")).run() + assert 'InvalidTimeout' == error.value.name + + +def test_no_running_pods(): + check = LoggingIndexTime() + check.get_pods_for_component = lambda *_: [not_running_kibana_pod] + with pytest.raises(OpenShiftCheckException) as error: + check.run() + assert 'kibanaNoRunningPods' == error.value.name + + +def test_with_running_pods(): + check = LoggingIndexTime() + check.get_pods_for_component = lambda *_: [plain_running_kibana_pod, plain_running_elasticsearch_pod] + check.curl_kibana_with_uuid = lambda *_: SAMPLE_UUID + check.wait_until_cmd_or_err = lambda *_: None + assert not check.run().get("failed") + + +@pytest.mark.parametrize('name, json_response, uuid, timeout', [ ( 'valid count in response', { @@ -77,94 +99,72 @@ def test_check_running_pods(pods, expect_pods): }, SAMPLE_UUID, 0.001, - [], ), ], ids=lambda argval: argval[0]) -def test_wait_until_cmd_or_err_succeeds(name, json_response, uuid, timeout, extra_words): +def test_wait_until_cmd_or_err_succeeds(name, json_response, uuid, timeout): check = canned_loggingindextime(lambda *_: json.dumps(json_response)) check.wait_until_cmd_or_err(plain_running_elasticsearch_pod, uuid, timeout) -@pytest.mark.parametrize('name, json_response, uuid, timeout, extra_words', [ +@pytest.mark.parametrize('name, json_response, timeout, expect_error', [ ( 'invalid json response', { "invalid_field": 1, }, - SAMPLE_UUID, 0.001, - ["invalid response", "Elasticsearch"], + 'esInvalidResponse', ), ( 'empty response', {}, - SAMPLE_UUID, 0.001, - ["invalid response", "Elasticsearch"], + 'esInvalidResponse', ), ( 'valid response but invalid match count', { "count": 0, }, - SAMPLE_UUID, 0.005, - ["expecting match", SAMPLE_UUID, "0.005s"], + 'NoMatchFound', ) ], ids=lambda argval: argval[0]) -def test_wait_until_cmd_or_err(name, json_response, uuid, timeout, extra_words): +def test_wait_until_cmd_or_err(name, json_response, timeout, expect_error): check = canned_loggingindextime(lambda *_: json.dumps(json_response)) with pytest.raises(OpenShiftCheckException) as error: - check.wait_until_cmd_or_err(plain_running_elasticsearch_pod, uuid, timeout) + check.wait_until_cmd_or_err(plain_running_elasticsearch_pod, SAMPLE_UUID, timeout) - for word in extra_words: - assert word in str(error) + assert expect_error == error.value.name -@pytest.mark.parametrize('name, json_response, uuid, extra_words', [ - ( - 'correct response code, found unique id is returned', - { - "statusCode": 404, - }, - "sample unique id", - ["sample unique id"], - ), -], ids=lambda argval: argval[0]) -def test_curl_kibana_with_uuid(name, json_response, uuid, extra_words): - check = canned_loggingindextime(lambda *_: json.dumps(json_response)) - check.generate_uuid = lambda: uuid - - result = check.curl_kibana_with_uuid(plain_running_kibana_pod) - - for word in extra_words: - assert word in result +def test_curl_kibana_with_uuid(): + check = canned_loggingindextime(lambda *_: json.dumps({"statusCode": 404})) + check.generate_uuid = lambda: SAMPLE_UUID + assert SAMPLE_UUID == check.curl_kibana_with_uuid(plain_running_kibana_pod) -@pytest.mark.parametrize('name, json_response, uuid, extra_words', [ +@pytest.mark.parametrize('name, json_response, expect_error', [ ( 'invalid json response', { "invalid_field": "invalid", }, - SAMPLE_UUID, - ["invalid response returned", 'Missing "statusCode" key'], + 'kibanaInvalidResponse', ), ( 'wrong error code in response', { "statusCode": 500, }, - SAMPLE_UUID, - ["Expecting error code", "500"], + 'kibanaInvalidReturnCode', ), ], ids=lambda argval: argval[0]) -def test_failed_curl_kibana_with_uuid(name, json_response, uuid, extra_words): +def test_failed_curl_kibana_with_uuid(name, json_response, expect_error): check = canned_loggingindextime(lambda *_: json.dumps(json_response)) - check.generate_uuid = lambda: uuid + check.generate_uuid = lambda: SAMPLE_UUID with pytest.raises(OpenShiftCheckException) as error: check.curl_kibana_with_uuid(plain_running_kibana_pod) - for word in extra_words: - assert word in str(error) + assert expect_error == error.value.name