Skip to content
This repository has been archived by the owner on Jun 5, 2023. It is now read-only.

Commit

Permalink
Merge pull request #546 from NVISO-BE/development
Browse files Browse the repository at this point in the history
Release 0.2.14
  • Loading branch information
daanraman authored Jun 24, 2020
2 parents 3ac9e9f + 567b3ce commit 022525d
Show file tree
Hide file tree
Showing 44 changed files with 3,684 additions and 849 deletions.
18 changes: 18 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,22 @@
# Changelog
## [Version 0.2.14](https://github.com/NVISO-BE/ee-outliers/releases/tag/0.2.14) (June 24, 2020)
### New features
- Provide the option to authenticate to Elasticsearch using username/password in the configuration file
- Add the option to highlight the part that matched the use case for simplequery models
- Support for multiple use-cases in one configuration file
- Add new version of word2vec model
### Minor changes
- Documentation updated
- Make `whitelist_regexp` and `whitelist_literals` sections non-required in the configuration file
- `timestamp_field` default parameter set to "@timestamp"
- Changed default behavior around the use of derived fields (through grok fields). The derived fields are now by default
not added as new fields in case an outlier event is found. To activate it, you have to set the `use_derived_fields` use
case parameter to `1`
- Process documents non-chronologically by default in simplequery use cases
### Bug fixes
- In case an outlier event is found, avoid creating fields outside the outlier dictionary when `use_derived_fields` is
activated

## [Version 0.2.13](https://github.com/NVISO-BE/ee-outliers/releases/tag/0.2.13) (April 8, 2020)
- Improved documentation (source code and user documentation)
- Fixes an issue where DSL queries in use case configuration files would not be correctly parsed (issue #455)
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.2.13
0.2.14
773 changes: 395 additions & 378 deletions app/analyzers/ml_models/word2vec.py

Large diffs are not rendered by default.

42 changes: 38 additions & 4 deletions app/analyzers/simplequery.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,25 @@
from helpers.singletons import es, logging
from helpers.singletons import settings, es, logging
from helpers.analyzer import Analyzer
import re
from configparser import NoSectionError, NoOptionError


class SimplequeryAnalyzer(Analyzer):

def __init__(self, model_name, config_section):
    """Initialize a simplequery analyzer for the given model name and configuration section."""
    # Zero-argument super() is equivalent to super(SimplequeryAnalyzer, self) in Python 3
    super().__init__("simplequery", model_name, config_section)

def _extract_additional_model_settings(self):
    """
    Override method from Analyzer.

    Resolve the "highlight_match" setting: the use-case section takes precedence;
    otherwise fall back to the [simplequery] section of the general configuration,
    defaulting to False when it is defined in neither place.
    """
    highlight = self.config_section.getboolean("highlight_match")
    if highlight is None:
        # Not set in the use-case section; try the general configuration file
        try:
            highlight = settings.config.getboolean("simplequery", "highlight_match")
        except (NoSectionError, NoOptionError):
            highlight = False
    self.model_settings["highlight_match"] = highlight

def evaluate_model(self):

model_filter = {
Expand Down Expand Up @@ -52,9 +65,30 @@ def evaluate_model(self):
if self.total_events > 0:
for doc in documents:
logging.tick()
fields = es.extract_fields_from_document(
doc, extract_derived_fields=self.model_settings["use_derived_fields"])
outlier = self.create_outlier(fields, doc)
outlier = self._create_outlier(doc)
self.process_outlier(outlier)

self.print_analysis_summary()

def _create_outlier(self, raw_doc):
    """
    Create an outlier from a raw document.

    When the "highlight_match" model setting is enabled, the Elasticsearch
    highlight section of the hit is attached to the outlier, together with the
    values extracted from the <value>...</value> markers in each highlighted
    fragment.

    :param raw_doc: raw document representing one hit event from an Elasticsearch request
    :return: the created outlier
    """
    extra_outlier_information = dict()
    if self.model_settings["highlight_match"]:
        extra_outlier_information["matched_fields"] = raw_doc["highlight"]

        matched_values = dict()
        for key, highlighted_fragments in raw_doc["highlight"].items():
            matched_values[key] = list()
            for fragment in highlighted_fragments:
                # Find values between tags <value> and </value>.
                # re.DOTALL lets "." also match newlines inside a value.
                values = re.findall(r"<value>(.*?)</value>", fragment, flags=re.DOTALL)
                # BUG FIX: accumulate values across fragments instead of
                # overwriting, so matches from every highlighted fragment are
                # kept rather than only those of the last one
                matched_values[key].extend(values)
        extra_outlier_information["matched_values"] = str(matched_values)

    fields = es.extract_fields_from_document(raw_doc,
                                             extract_derived_fields=self.model_settings["use_derived_fields"])
    return self.create_outlier(fields, raw_doc, extra_outlier_information=extra_outlier_information)
1,100 changes: 933 additions & 167 deletions app/analyzers/word2vec.py

Large diffs are not rendered by default.

51 changes: 44 additions & 7 deletions app/helpers/analyzer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import abc
import dateutil
from dateutil import parser

from configparser import NoOptionError
from configparser import NoOptionError, NoSectionError

from helpers.singletons import settings, es, logging
import helpers.utils
Expand Down Expand Up @@ -66,7 +66,7 @@ def _extract_model_settings(self):

# by default, we don't process documents chronologically when analyzing the model, as it
# has a high impact on performance when scanning in Elasticsearch
model_settings["process_documents_chronologically"] = True
model_settings["process_documents_chronologically"] = False

model_settings["es_query_filter"] = self.config_section.get("es_query_filter")
if model_settings["es_query_filter"]:
Expand Down Expand Up @@ -94,7 +94,7 @@ def _extract_model_settings(self):
except NoOptionError:
model_settings["should_notify"] = False

model_settings["use_derived_fields"] = self.config_section.getboolean("use_derived_fields")
model_settings["use_derived_fields"] = self.config_section.getboolean("use_derived_fields", fallback=False)

model_settings["es_index"] = self.config_section.get("es_index")
if not model_settings["es_index"]:
Expand Down Expand Up @@ -226,7 +226,9 @@ def process_outlier(self, outlier):
if settings.print_outliers_to_console:
logging.logger.debug("%s [whitelisted outlier]", outlier.outlier_dict["summary"])
else:
es.process_outlier(outlier=outlier, should_notify=self.model_settings["should_notify"])
es.process_outlier(outlier=outlier,
should_notify=self.model_settings["should_notify"],
extract_derived_fields=self.model_settings["use_derived_fields"])

def print_analysis_intro(self, event_type, total_events):
"""
Expand Down Expand Up @@ -261,8 +263,8 @@ def get_time_window_info(history_days=None, history_hours=None):
search_range_end = search_range["range"][str(settings.config.get("general", "timestamp_field",
fallback="timestamp"))]["lte"]

search_start_range_printable = dateutil.parser.parse(search_range_start).strftime('%Y-%m-%d %H:%M:%S')
search_end_range_printable = dateutil.parser.parse(search_range_end).strftime('%Y-%m-%d %H:%M:%S')
search_start_range_printable = parser.parse(search_range_start).strftime('%Y-%m-%d %H:%M:%S')
search_end_range_printable = parser.parse(search_range_end).strftime('%Y-%m-%d %H:%M:%S')
return "processing events between " + search_start_range_printable + " and " + search_end_range_printable

@abc.abstractmethod
Expand All @@ -273,3 +275,38 @@ def evaluate_model(self):
detection of outliers.
"""
raise NotImplementedError()

def extract_parameter(self, param_name, param_type=None, default=None):
"""
Extract parameter in general or use-case conf file.
:param param_name: Name of the parameter to extract in general or use-case conf file
:param param_type: Type of conversion of the parameter. string, int, float, boolean or None.
:param default: default value if parameter is not found.
:return: the parameter converted into param_type
"""
config_section_get = {"string": self.config_section.get,
"int": self.config_section.getint,
"float": self.config_section.getfloat,
"boolean": self.config_section.getboolean}

settings_config_get = {"string": settings.config.get,
"int": settings.config.getint,
"float": settings.config.getfloat,
"boolean": settings.config.getboolean}
try:
if param_type is None:
param_value = config_section_get["string"](param_name)
if param_value is None:
param_value = settings_config_get["string"](self.model_type, param_name)
else:
param_value = config_section_get[param_type](param_name)
if param_value is None:
param_value = settings_config_get[param_type](self.model_type, param_name)
except (NoOptionError, NoSectionError) as e:
if default is not None:
param_value = default
else:
raise ValueError(e)

return param_value
48 changes: 33 additions & 15 deletions app/helpers/analyzerfactory.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from helpers.singletons import logging
import configparser
import os
import re
Expand Down Expand Up @@ -35,34 +36,51 @@ def section_to_analyzer(section_name, section):
def create(config_file):
    """
    Creates an analyzer based on a configuration file.
    Deprecated in favor of `create_multi`.

    :param config_file: configuration file containing a single analyzer
    :return: returns the analyzer object
    :raises ValueError: if the file does not define exactly one use case
    """
    found_analyzers = AnalyzerFactory.create_multi(config_file)

    # A single-analyzer config file must define exactly one use case
    if len(found_analyzers) != 1:
        raise ValueError("Config file must contain exactly one use case (found %d)" % len(found_analyzers))

    return found_analyzers[0]

@staticmethod
def create_multi(config_file, configparser_options=None):
    """
    Creates a list of analyzers based on a configuration file.

    :param config_file: configuration file containing one or multiple analyzers
    :param configparser_options: optional dict of keyword arguments forwarded to
                                 configparser.RawConfigParser(...)
    :return: returns the analyzer objects in a list
    :raises ValueError: if the configuration file does not exist
    """
    # BUG FIX: use None instead of a mutable default argument ({}), which would
    # be shared across all calls and could leak state between them
    if configparser_options is None:
        configparser_options = {}

    if not os.path.isfile(config_file):
        raise ValueError("Use case file %s does not exist" % config_file)

    # Read the ini file from disk
    config = configparser.RawConfigParser(**configparser_options)
    config.read(config_file)

    logging.logger.debug(config)

    # Create a list of all analyzers found in the config file; sections that do
    # not map to an analyzer yield None and are filtered out
    analyzers = [AnalyzerFactory.section_to_analyzer(section_name, section)
                 for section_name, section in config.items()]
    analyzers = list(filter(None, analyzers))

    # Attach any file-level whitelist entries to every analyzer in the file
    for analyzer in analyzers:
        if "whitelist_literals" in config.sections():
            for _, value in config["whitelist_literals"].items():
                analyzer.model_whitelist_literals.append(
                    set([x.strip() for x in value.split(",")]))

        if "whitelist_regexps" in config.sections():
            for _, value in config["whitelist_regexps"].items():
                analyzer.model_whitelist_regexps.append(
                    (set([re.compile(x.strip(), re.IGNORECASE) for x in value.split(",")])))

    return analyzers
Loading

0 comments on commit 022525d

Please sign in to comment.