
Commit

Merge branch 'master' into paulway_improve_topological_sort
PaulWay authored Feb 6, 2023
2 parents 2574f19 + da20d33 commit 65dec4a
Showing 353 changed files with 1,600 additions and 1,197 deletions.
12 changes: 10 additions & 2 deletions docs/api_index.rst
@@ -30,6 +30,14 @@ insights.core.dr
:members:
:exclude-members: requires, optional, metadata, group, tags

insights.core.exceptions
------------------------

.. automodule:: insights.core.exceptions
:members:
:show-inheritance:
:undoc-members:

insights.core.filters
---------------------

@@ -74,8 +82,8 @@ insights.parsers
----------------

.. automodule:: insights.parsers
:members: ParseException, SkipException, calc_offset, get_active_lines,
keyword_search, optlist_to_dict, parse_delimited_table,
:members: calc_offset, get_active_lines, keyword_search,
optlist_to_dict, parse_delimited_table,
parse_fixed_table, split_kv_pairs, unsplit_lines
:show-inheritance:
:undoc-members:
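The api_index change above drops ParseException and SkipException from the insights.parsers member list and documents the new insights.core.exceptions module instead. A minimal import sketch under that assumption: SkipComponent, InvalidArchive, InvalidContentType, and CalledProcessError are imported from insights.core.exceptions elsewhere in this commit, and ParseException is assumed to live there as well.

```python
# Sketch only: import the relocated exception classes from their new home.
from insights.core.exceptions import (
    CalledProcessError,
    InvalidArchive,
    InvalidContentType,
    ParseException,   # assumed to be exported here alongside the others
    SkipComponent,
)
```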
8 changes: 3 additions & 5 deletions docs/exception_model.rst
@@ -82,7 +82,7 @@ any exceptions in the data (“dirty parser”). This allows rules that don’t
exceptions to rely on only the first parser, and those rules will not run if valid data
is not present. If the dirty parser identifies errors in the data then it will save
information regarding the errors for use by rules. If no errors are found in the data
then the dirty parser will raise :py:class:`insights.core.exceptions.SkipException`
then the dirty parser will raise :py:class:`insights.core.exceptions.SkipComponent`
to indicate to the engine that it should be removed from the dependency hierarchy.

Other Exceptions from Parsers
@@ -99,15 +99,13 @@ types aren’t important and such checks may limit expressiveness and flexibilit
Parsers should not use the assert statement in place of error handling code.
Asserts are for debugging purposes only.
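A brief, hedged illustration of that guidance, assuming ParseException is importable from the new insights.core.exceptions module:

```python
# Sketch only: raise a parser exception instead of asserting on input data.
from insights.core.exceptions import ParseException


def parse_count(line):
    # Avoid: assert line.isdigit(), "bad input"  (asserts are for debugging only)
    if not line.isdigit():
        raise ParseException("unexpected value: %s" % line)
    return int(line)
```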

SkipComponent and SkipException
===============================
SkipComponent
=============

Any component may raise `SkipComponent` to signal to the engine that
nothing is wrong but that the component should be taken out of dependency
resolution. This is useful if a component's dependencies are met but it's
still unable to produce a meaningful result.
:py:class:`insights.core.exceptions.SkipException` is a specialization of this for the
dirty parser use case above, but it's treated the same as `SkipComponent`.
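A minimal sketch of this pattern, using a hypothetical dirty parser over Specs.messages; only the SkipComponent import path comes from this commit:

```python
from insights.core import Parser
from insights.core.exceptions import SkipComponent
from insights.core.plugins import parser
from insights.specs import Specs


@parser(Specs.messages)
class MessagesErrors(Parser):
    """Hypothetical dirty parser that keeps only lines that look like errors."""

    def parse_content(self, content):
        self.errors = [line for line in content if "ERROR" in line]
        if not self.errors:
            # The data is fine, so take this component out of dependency
            # resolution; rules that depend on it simply will not run.
            raise SkipComponent("no errors found")
```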

Exception Recognition by the Insights Engine
============================================
3 changes: 3 additions & 0 deletions docs/shared_parsers_catalog/blacklisted.rst
@@ -0,0 +1,3 @@
.. automodule:: insights.parsers.blacklisted
:members:
:show-inheritance:
15 changes: 10 additions & 5 deletions insights/__init__.py
@@ -30,10 +30,10 @@
from insights.core import (CommandParser, ContainerParser, FileListing, IniConfigFile, JSONParser, LegacyItemAccess, # noqa: F401
LogFileOutput, Parser, Scannable, SysconfigOptions, Syslog, XMLParser, YAMLParser, dr, # noqa: F401
taglang)
from insights.core.archives import COMPRESSION_TYPES, InvalidArchive, InvalidContentType, extract
from insights.core.archives import COMPRESSION_TYPES, extract
from insights.core.context import (ClusterArchiveContext, ExecutionContext, HostContext, # noqa: F401
HostArchiveContext, SerializedArchiveContext)
from insights.core.exceptions import SkipComponent # noqa: F401
from insights.core.exceptions import InvalidArchive, InvalidContentType, SkipComponent # noqa: F401
from insights.core.filters import add_filter, apply_filters, get_filters # noqa: F401
from insights.core.hydration import create_context, initialize_broker # noqa: F401
from insights.core.plugins import (combiner, condition, datasource, fact, incident, make_fail, make_fingerprint, # noqa: F401
@@ -266,9 +266,8 @@ def _load_context(path):
return dr.get_component(path)


def run(component=None, root=None, print_summary=False,
context=None, inventory=None, print_component=None):

def run(component=None, root=None, print_summary=False, context=None, inventory=None, print_component=None,
store_skips=False):
args = None
formatters = None

@@ -293,6 +292,8 @@ def run(component=None, root=None, print_summary=False,
p.add_argument("--context", help="Execution Context. Defaults to HostContext if an archive isn't passed.")
p.add_argument("--no-load-default", help="Don't load the default plugins.", action="store_true")
p.add_argument("--parallel", help="Execute rules in parallel.", action="store_true")
p.add_argument("--show-skips", help="Capture skips in the broker for troubleshooting.", action="store_true",
default=False)
p.add_argument("--tags", help="Expression to select rules by tag.")

class Args(object):
@@ -385,6 +386,10 @@ class Args(object):
graph = dr.COMPONENTS[dr.GROUPS.single]

broker = dr.Broker()
if args:
broker.store_skips = args.show_skips
else:
broker.store_skips = store_skips

if args and args.bare:
ctx = ExecutionContext() # dummy context that no spec depend on. needed for filters to work
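The run() changes above add a store_skips keyword and a matching --show-skips CLI switch so the broker can record skipped components for troubleshooting. A hedged usage sketch; where the recorded skips are exposed on the broker is an assumption, since only the flag wiring appears in this diff.

```python
# Sketch only: enable skip tracking when driving insights programmatically.
from insights import run
from insights.parsers.hostname import Hostname

broker = run(Hostname, store_skips=True)
# The attribute holding the recorded skips is assumed, not shown in this commit.
print(getattr(broker, "skips", "no skips recorded"))
```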
59 changes: 44 additions & 15 deletions insights/client/apps/malware_detection/__init__.py
@@ -7,7 +7,7 @@
import logging
from glob import glob
from datetime import datetime
from tempfile import NamedTemporaryFile
from tempfile import NamedTemporaryFile, gettempdir
try:
# python 2
from urllib import quote as urlencode
@@ -21,7 +21,8 @@
from insights.client.utilities import (
generate_machine_id, write_data_to_file, get_time
)
from insights.util.subproc import call, CalledProcessError
from insights.core.exceptions import CalledProcessError
from insights.util.subproc import call

logger = logging.getLogger(__name__)
MIN_YARA_VERSION = "4.1.0"
@@ -186,6 +187,7 @@ def __init__(self, insights_config):
self.add_metadata = self._get_config_option('add_metadata', False)

self.matches = 0
self.potential_matches = 0

def run(self):
# Start the scans and record the time they were started
@@ -201,7 +203,11 @@ def run(self):

# Write a message to user informing them if there were matches or not and what to do next
if self.matches == 0:
logger.info("No rule matches found.\n")
if self.potential_matches == 0:
logger.info("No rule matches found.\n")
else:
logger.info("Rule matches potentially found but problems encountered parsing them, so no match data to upload.")
logger.info("Please contact support.\n")
else:
logger.info("Found %d rule match%s.", self.matches, 'es' if self.matches > 1 else '')
if not self.test_scan:
@@ -604,11 +610,12 @@ def _get_rules(self):
# However it can happen that the rules file isn't removed for some reason, so remove any existing
# rules files before beginning a new scan, otherwise they may show up as matches in the scan results.
old_rules_files = sum([glob(os.path.join(path, rules))
for path in ('/tmp', '/var/tmp')
for path in ('/tmp', '/var/tmp', '/usr/tmp', gettempdir())
for rules in ('.tmpmdsigs*', 'tmp_malware-detection-client_rules.*')], [])
for old_rules_file in old_rules_files:
logger.debug("Removing old rules file %s", old_rules_file)
os.remove(old_rules_file)
if os.path.exists(old_rules_file):
logger.debug("Removing old rules file %s", old_rules_file)
os.remove(old_rules_file)

self.rules_location = self._get_config_option('rules_location', '')

@@ -741,8 +748,16 @@ def scan_filesystem(self):
return False

# Exclude the rules file and insights-client log files, unless they are things we specifically want to scan
if self.rules_file not in self.scan_fsobjects:
self.filesystem_scan_exclude_list.append(self.rules_file)
# Get a list of potential rules file locations, eg /tmp, /var/tmp, /usr/tmp and gettempdir()
# eg customers may have /tmp linked to /var/tmp, so both must be checked when excluding the downloaded rules
rules_file_name = os.path.basename(self.rules_file)
potential_tmp_dirs = set([gettempdir(), '/tmp', '/var/tmp', '/usr/tmp'])
potential_rules_files = set(list(map(lambda d: os.path.join(d, rules_file_name), potential_tmp_dirs)) + [self.rules_file])
rules_files = list(filter(lambda f: os.path.isfile(f), potential_rules_files))
for rules_file in rules_files:
if rules_file not in self.scan_fsobjects:
self.filesystem_scan_exclude_list.append(rules_file)
logger.debug("Excluding rules file: %s", rules_file)
insights_log_files = glob(constants.default_log_file + '*')
self.filesystem_scan_exclude_list.extend(list(set(insights_log_files) - set(self.scan_fsobjects)))

@@ -795,7 +810,12 @@ def scan_filesystem(self):
logger.debug("Unable to scan %s: %s", toplevel_dir, cpe.output.strip())
continue

self.parse_scan_output(output.strip())
try:
self.parse_scan_output(output.strip())
except Exception as e:
self.potential_matches += 1
logger.exception("Rule match(es) potentially found in %s but problems encountered parsing the results: %s. Skipping ...",
toplevel_dir, str(e))

dir_scan_end = time.time()
logger.info("Scan time for %s: %d seconds", toplevel_dir, (dir_scan_end - dir_scan_start))
@@ -862,7 +882,12 @@ def scan_processes(self):
logger.debug("Unable to scan process %s: %s", scan_pid, cpe.output.strip())
continue

self.parse_scan_output(output)
try:
self.parse_scan_output(output)
except Exception as e:
self.potential_matches += 1
logger.exception("Rule match(es) potentially found in process %s but problems encountered parsing the results: %s. Skipping ...",
scan_pid, str(e))

pid_scan_end = time.time()
logger.info("Scan time for process %s: %d seconds", scan_pid, (pid_scan_end - pid_scan_start))
@@ -969,11 +994,15 @@ def skip_string_data_lines(string_data_lines):
rule_match['matches'] = [rule_match_dict]

if self.add_metadata:
# Add extra data to each rule match, beyond what yara provides
# Eg, for files: line numbers & context, checksums; for processes: process name
# TODO: find more pythonic ways of doing this stuff instead of using system commands
metadata_func = self._add_file_metadata if source_type == 'file' else self._add_process_metadata
metadata_func(rule_match['matches'])
try:
# Add extra data to each rule match, beyond what yara provides
# Eg, for files: line numbers & context, checksums; for processes: process name
# TODO: find more pythonic ways of doing this stuff instead of using system commands
metadata_func = self._add_file_metadata if source_type == 'file' else self._add_process_metadata
metadata_func(rule_match['matches'])
except Exception as e:
logger.error("Error adding metadata to rule match %s in %s %s: %s. Skipping ...",
rule_name, source_type, source, str(e))

self.matches += 1
logger.info("Matched rule %s in %s %s", rule_name, source_type, source)
11 changes: 11 additions & 0 deletions insights/client/data_collector.py
@@ -15,6 +15,7 @@
from subprocess import Popen, PIPE, STDOUT
from tempfile import NamedTemporaryFile

from insights.core.blacklist import BLACKLISTED_SPECS
from insights.util import mangle
from ..contrib.soscleaner import SOSCleaner
from .utilities import _expand_paths, get_version_info, systemd_notify_init_thread, get_tags
@@ -132,6 +133,10 @@ def _write_blacklist_report(self, blacklist_report):
self.archive.add_metadata_to_archive(
json.dumps(blacklist_report), '/blacklist_report')

if BLACKLISTED_SPECS:
self.archive.add_metadata_to_archive(
json.dumps({"specs": BLACKLISTED_SPECS}), '/blacklisted_specs.txt')

def _write_egg_release(self):
logger.debug("Writing egg release to archive...")
egg_release = ''
@@ -327,11 +332,13 @@ def run_collection(self, conf, rm_conf, branch_info, blacklist_report):
'insights_commands', mangle.mangle_command(c['command']))
if c['command'] in rm_commands or c.get('symbolic_name') in rm_commands:
logger.warn("WARNING: Skipping command %s", c['command'])
BLACKLISTED_SPECS.append(c['symbolic_name'])
elif self.mountpoint == "/" or c.get("image"):
cmd_specs = self._parse_command_spec(c, conf['pre_commands'])
for s in cmd_specs:
if s['command'] in rm_commands:
logger.warn("WARNING: Skipping command %s", s['command'])
BLACKLISTED_SPECS.append(s['symbolic_name'])
continue
cmd_spec = InsightsCommand(self.config, s, self.mountpoint)
self.archive.add_to_archive(cmd_spec)
@@ -343,12 +350,14 @@ def run_collection(self, conf, rm_conf, branch_info, blacklist_report):
for f in conf['files']:
if f['file'] in rm_files or f.get('symbolic_name') in rm_files:
logger.warn("WARNING: Skipping file %s", f['file'])
BLACKLISTED_SPECS.append(f['symbolic_name'])
else:
file_specs = self._parse_file_spec(f)
for s in file_specs:
# filter files post-wildcard parsing
if s['file'] in rm_conf.get('files', []):
logger.warn("WARNING: Skipping file %s", s['file'])
BLACKLISTED_SPECS.append(s['symbolic_name'])
else:
file_spec = InsightsFile(s, self.mountpoint)
self.archive.add_to_archive(file_spec)
@@ -361,11 +370,13 @@ def run_collection(self, conf, rm_conf, branch_info, blacklist_report):
if g.get('symbolic_name') in rm_files:
# ignore glob via symbolic name
logger.warn("WARNING: Skipping file %s", g['glob'])
BLACKLISTED_SPECS.append(g['symbolic_name'])
else:
glob_specs = self._parse_glob_spec(g)
for g in glob_specs:
if g['file'] in rm_files:
logger.warn("WARNING: Skipping file %s", g['file'])
BLACKLISTED_SPECS.append(g['symbolic_name'])
else:
glob_spec = InsightsFile(g, self.mountpoint)
self.archive.add_to_archive(glob_spec)
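Each skipped command, file, or glob spec above is recorded in BLACKLISTED_SPECS by its symbolic name, and _write_blacklist_report serializes the list into the archive as /blacklisted_specs.txt. A hedged illustration of that payload, with hypothetical spec names:

```python
import json

# Mirrors json.dumps({"specs": BLACKLISTED_SPECS}) from the change above.
BLACKLISTED_SPECS = ["installed_rpms", "ps_aux"]  # hypothetical symbolic names
print(json.dumps({"specs": BLACKLISTED_SPECS}))
# {"specs": ["installed_rpms", "ps_aux"]}
```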
54 changes: 49 additions & 5 deletions insights/collect.py
@@ -9,6 +9,7 @@
"""
from __future__ import print_function
import argparse
import json
import logging
import os
import sys
@@ -17,11 +18,13 @@

from datetime import datetime

from insights import apply_configs, apply_default_enabled, dr, get_pool
from insights.core import blacklist, filters
from insights import apply_configs, apply_default_enabled, get_pool
from insights.core import blacklist, dr, filters
from insights.core.blacklist import BLACKLISTED_SPECS
from insights.core.exceptions import CalledProcessError
from insights.core.serde import Hydration
from insights.util import fs
from insights.util.subproc import call, CalledProcessError
from insights.util.subproc import call

SAFE_ENV = {
"PATH": os.path.pathsep.join([
@@ -203,8 +206,8 @@
- name: insights.components.virtualization.IsBareMetal
enabled: true
# needed for the 'pre-check' of the 'ss' spec
- name: insights.parsers.lsmod
# needed for the 'pre-check' of the 'ss' spec and the 'modinfo_filtered_modules' spec
- name: insights.parsers.lsmod.LsMod
enabled: true
# needed for the 'pre-check' of the 'is_satellite_server' spec
@@ -401,6 +404,7 @@ def collect(manifest=default_manifest, tmp_path=None, compress=False, rm_conf=No
log.warning('WARNING: Unknown component in blacklist: %s' % component)
else:
dr.set_enabled(component, enabled=False)
BLACKLISTED_SPECS.append(component.split('.')[-1])
log.warning('WARNING: Skipping component: %s', component)

to_persist = get_to_persist(client.get("persist", set()))
@@ -437,6 +441,11 @@ def collect(manifest=default_manifest, tmp_path=None, compress=False, rm_conf=No
broker.add_observer(h.make_persister(to_persist))
dr.run_all(broker=broker, pool=pool)

if BLACKLISTED_SPECS:
_write_out_blacklisted_specs(output_path)
# Delete the list so the specs aren't written again by the client.
del BLACKLISTED_SPECS[:]

collect_errors = _parse_broker_exceptions(broker, EXCEPTIONS_TO_REPORT)

if compress:
Expand Down Expand Up @@ -472,6 +481,41 @@ def _parse_broker_exceptions(broker, exceptions_to_report):
return errors


def _write_out_blacklisted_specs(output_path):
"""
Write out the blacklisted specs to blacklisted_specs.txt, and create
a meta-data file for this file. That way it can be loaded when the
archive is processed.
Args:
output_path (str): Path of the output directory.
"""
if os.path.exists(os.path.join(output_path, "meta_data")):
output_path_root = os.path.join(output_path, "data")
else:
output_path_root = output_path

with open(os.path.join(output_path_root, "blacklisted_specs.txt"), "w") as of:
json.dump({"specs": BLACKLISTED_SPECS}, of)

doc = {
"name": "insights.specs.Specs.blacklisted_specs",
"exec_time": 0.0,
"errors": [],
"results": {
"type": "insights.core.spec_factory.DatasourceProvider",
"object": {
"relative_path": "blacklisted_specs.txt"
}
},
"ser_time": 0.0
}

meta_path = os.path.join(os.path.join(output_path, "meta_data"), "insights.specs.Specs.blacklisted_specs")
with open(meta_path, "w") as of:
json.dump(doc, of)
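
Per the function above, a collection directory that already has a meta_data directory ends up with the spec data under data/ plus a serde metadata stub pointing at it. A hedged sketch of reading both artifacts back; output_path and the spec names are hypothetical:

```python
import json
import os

output_path = "/tmp/insights-collection"  # hypothetical collection directory

with open(os.path.join(output_path, "data", "blacklisted_specs.txt")) as f:
    print(json.load(f)["specs"])  # e.g. ["installed_rpms", "ps_aux"]

meta = os.path.join(output_path, "meta_data", "insights.specs.Specs.blacklisted_specs")
with open(meta) as f:
    print(json.load(f)["results"]["object"]["relative_path"])  # "blacklisted_specs.txt"
```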


def main():
# Remove command line args so that they are not parsed by any called modules
# The main fxn is only invoked as a cli, if calling from another cli then