Skip to content
This repository has been archived by the owner on Jun 5, 2023. It is now read-only.

Commit

Permalink
Merge pull request #546 from NVISO-BE/development
Browse files Browse the repository at this point in the history
Release 0.2.14
  • Loading branch information
daanraman authored Jun 24, 2020
2 parents 3ac9e9f + 567b3ce commit 022525d
Show file tree
Hide file tree
Showing 44 changed files with 3,684 additions and 849 deletions.
18 changes: 18 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,22 @@
# Changelog
## [Version 0.2.14](https://github.com/NVISO-BE/ee-outliers/releases/tag/0.2.14) (June 24, 2020)
### New features
- Provide the option to authenticate to Elasticsearch using username/password in the configuration file
- Add the option to highlight the part that matched the use case for simplequery models
- Support for multiple use-cases in one configuration file
- Add new version of word2vec model
### Minor changes
- Documentation updated
- Make `whitelist_regexp` and `whitelist_literals` sections non-required in the configuration file
- `timestamp_field` default parameter set to "@timestamp"
- Changed default behavior around the use of derived fields (through grok fields). The derived fields are now by default
not added as new fields in case an outlier event is found. To activate it, you have to set the `use_derived_fields` use
case parameter to `1`
- Process documents non-chronologically by default in simplequery use cases
### Bug fixes
- In case an outlier event is found, avoid creating fields outside the outlier dictionary when `use_derived_fields` is
activated

## [Version 0.2.13](https://github.com/NVISO-BE/ee-outliers/releases/tag/0.2.13) (April 8, 2020)
- Improved documentation (source code and user documentation)
- Fixes an issue where DSL queries in use case configuration files would not be correctly parsed (issue #455)
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.2.13
0.2.14
773 changes: 395 additions & 378 deletions app/analyzers/ml_models/word2vec.py

Large diffs are not rendered by default.

42 changes: 38 additions & 4 deletions app/analyzers/simplequery.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,25 @@
from helpers.singletons import es, logging
from helpers.singletons import settings, es, logging
from helpers.analyzer import Analyzer
import re
from configparser import NoSectionError, NoOptionError


class SimplequeryAnalyzer(Analyzer):

def __init__(self, model_name, config_section):
    """Initialize a simplequery analyzer for the given model name and configuration section."""
    # Zero-argument super() is equivalent to super(SimplequeryAnalyzer, self) in Python 3
    super().__init__("simplequery", model_name, config_section)

def _extract_additional_model_settings(self):
    """
    Override method from Analyzer.

    Resolve the "highlight_match" setting: the use-case section takes precedence;
    otherwise fall back to the [simplequery] section of the general configuration,
    defaulting to False when it is defined in neither place.
    """
    highlight = self.config_section.getboolean("highlight_match")
    if highlight is None:
        # Not set in the use-case section; try the general configuration file
        try:
            highlight = settings.config.getboolean("simplequery", "highlight_match")
        except (NoSectionError, NoOptionError):
            highlight = False
    self.model_settings["highlight_match"] = highlight

def evaluate_model(self):

model_filter = {
Expand Down Expand Up @@ -52,9 +65,30 @@ def evaluate_model(self):
if self.total_events > 0:
for doc in documents:
logging.tick()
fields = es.extract_fields_from_document(
doc, extract_derived_fields=self.model_settings["use_derived_fields"])
outlier = self.create_outlier(fields, doc)
outlier = self._create_outlier(doc)
self.process_outlier(outlier)

self.print_analysis_summary()

def _create_outlier(self, raw_doc):
    """
    Create an outlier from a raw document.

    When the "highlight_match" model setting is enabled, the Elasticsearch
    highlight section of the hit is attached to the outlier, together with the
    values extracted from the <value>...</value> markers in each highlighted
    fragment.

    :param raw_doc: raw document representing one hit event from an Elasticsearch request
    :return: the created outlier
    """
    extra_outlier_information = dict()
    if self.model_settings["highlight_match"]:
        extra_outlier_information["matched_fields"] = raw_doc["highlight"]

        matched_values = dict()
        for key, highlighted_fragments in raw_doc["highlight"].items():
            matched_values[key] = list()
            for fragment in highlighted_fragments:
                # Find values between tags <value> and </value>.
                # re.DOTALL lets "." also match newlines inside a value.
                values = re.findall(r"<value>(.*?)</value>", fragment, flags=re.DOTALL)
                # BUG FIX: accumulate values across fragments instead of
                # overwriting, so matches from every highlighted fragment are
                # kept rather than only those of the last one
                matched_values[key].extend(values)
        extra_outlier_information["matched_values"] = str(matched_values)

    fields = es.extract_fields_from_document(raw_doc,
                                             extract_derived_fields=self.model_settings["use_derived_fields"])
    return self.create_outlier(fields, raw_doc, extra_outlier_information=extra_outlier_information)
1,100 changes: 933 additions & 167 deletions app/analyzers/word2vec.py

Large diffs are not rendered by default.

51 changes: 44 additions & 7 deletions app/helpers/analyzer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import abc
import dateutil
from dateutil import parser

from configparser import NoOptionError
from configparser import NoOptionError, NoSectionError

from helpers.singletons import settings, es, logging
import helpers.utils
Expand Down Expand Up @@ -66,7 +66,7 @@ def _extract_model_settings(self):

# by default, we don't process documents chronologically when analyzing the model, as it
# has a high impact on performance when scanning in Elasticsearch
model_settings["process_documents_chronologically"] = True
model_settings["process_documents_chronologically"] = False

model_settings["es_query_filter"] = self.config_section.get("es_query_filter")
if model_settings["es_query_filter"]:
Expand Down Expand Up @@ -94,7 +94,7 @@ def _extract_model_settings(self):
except NoOptionError:
model_settings["should_notify"] = False

model_settings["use_derived_fields"] = self.config_section.getboolean("use_derived_fields")
model_settings["use_derived_fields"] = self.config_section.getboolean("use_derived_fields", fallback=False)

model_settings["es_index"] = self.config_section.get("es_index")
if not model_settings["es_index"]:
Expand Down Expand Up @@ -226,7 +226,9 @@ def process_outlier(self, outlier):
if settings.print_outliers_to_console:
logging.logger.debug("%s [whitelisted outlier]", outlier.outlier_dict["summary"])
else:
es.process_outlier(outlier=outlier, should_notify=self.model_settings["should_notify"])
es.process_outlier(outlier=outlier,
should_notify=self.model_settings["should_notify"],
extract_derived_fields=self.model_settings["use_derived_fields"])

def print_analysis_intro(self, event_type, total_events):
"""
Expand Down Expand Up @@ -261,8 +263,8 @@ def get_time_window_info(history_days=None, history_hours=None):
search_range_end = search_range["range"][str(settings.config.get("general", "timestamp_field",
fallback="timestamp"))]["lte"]

search_start_range_printable = dateutil.parser.parse(search_range_start).strftime('%Y-%m-%d %H:%M:%S')
search_end_range_printable = dateutil.parser.parse(search_range_end).strftime('%Y-%m-%d %H:%M:%S')
search_start_range_printable = parser.parse(search_range_start).strftime('%Y-%m-%d %H:%M:%S')
search_end_range_printable = parser.parse(search_range_end).strftime('%Y-%m-%d %H:%M:%S')
return "processing events between " + search_start_range_printable + " and " + search_end_range_printable

@abc.abstractmethod
Expand All @@ -273,3 +275,38 @@ def evaluate_model(self):
detection of outliers.
"""
raise NotImplementedError()

def extract_parameter(self, param_name, param_type=None, default=None):
"""
Extract parameter in general or use-case conf file.
:param param_name: Name of the parameter to extract in general or use-case conf file
:param param_type: Type of conversion of the parameter. string, int, float, boolean or None.
:param default: default value if parameter is not found.
:return: the parameter converted into param_type
"""
config_section_get = {"string": self.config_section.get,
"int": self.config_section.getint,
"float": self.config_section.getfloat,
"boolean": self.config_section.getboolean}

settings_config_get = {"string": settings.config.get,
"int": settings.config.getint,
"float": settings.config.getfloat,
"boolean": settings.config.getboolean}
try:
if param_type is None:
param_value = config_section_get["string"](param_name)
if param_value is None:
param_value = settings_config_get["string"](self.model_type, param_name)
else:
param_value = config_section_get[param_type](param_name)
if param_value is None:
param_value = settings_config_get[param_type](self.model_type, param_name)
except (NoOptionError, NoSectionError) as e:
if default is not None:
param_value = default
else:
raise ValueError(e)

return param_value
48 changes: 33 additions & 15 deletions app/helpers/analyzerfactory.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from helpers.singletons import logging
import configparser
import os
import re
Expand Down Expand Up @@ -35,34 +36,51 @@ def section_to_analyzer(section_name, section):
def create(config_file):
    """
    Creates an analyzer based on a configuration file.
    Deprecated in favor of `create_multi`.

    :param config_file: configuration file containing a single analyzer
    :return: returns the analyzer object
    :raises ValueError: if the file does not define exactly one use case
    """
    found_analyzers = AnalyzerFactory.create_multi(config_file)

    # A single-analyzer config file must define exactly one use case
    if len(found_analyzers) != 1:
        raise ValueError("Config file must contain exactly one use case (found %d)" % len(found_analyzers))

    return found_analyzers[0]

@staticmethod
def create_multi(config_file, configparser_options=None):
    """
    Creates a list of analyzers based on a configuration file.

    :param config_file: configuration file containing one or multiple analyzers
    :param configparser_options: optional dict of keyword arguments forwarded to
                                 configparser.RawConfigParser(...)
    :return: returns the analyzer objects in a list
    :raises ValueError: if the configuration file does not exist
    """
    # BUG FIX: use None instead of a mutable default argument ({}), which would
    # be shared across all calls and could leak state between them
    if configparser_options is None:
        configparser_options = {}

    if not os.path.isfile(config_file):
        raise ValueError("Use case file %s does not exist" % config_file)

    # Read the ini file from disk
    config = configparser.RawConfigParser(**configparser_options)
    config.read(config_file)

    logging.logger.debug(config)

    # Create a list of all analyzers found in the config file; sections that do
    # not map to an analyzer yield None and are filtered out
    analyzers = [AnalyzerFactory.section_to_analyzer(section_name, section)
                 for section_name, section in config.items()]
    analyzers = list(filter(None, analyzers))

    # Attach any file-level whitelist entries to every analyzer in the file
    for analyzer in analyzers:
        if "whitelist_literals" in config.sections():
            for _, value in config["whitelist_literals"].items():
                analyzer.model_whitelist_literals.append(
                    set([x.strip() for x in value.split(",")]))

        if "whitelist_regexps" in config.sections():
            for _, value in config["whitelist_regexps"].items():
                analyzer.model_whitelist_regexps.append(
                    (set([re.compile(x.strip(), re.IGNORECASE) for x in value.split(",")])))

    return analyzers
Loading

0 comments on commit 022525d

Please sign in to comment.