Skip to content
This repository has been archived by the owner on Jun 5, 2023. It is now read-only.

Commit

Permalink
Merge pull request #536 from NVISO-BE/development
Browse files Browse the repository at this point in the history
Merge development into dev-ml
  • Loading branch information
maximilienroberti authored Jun 22, 2020
2 parents 64a0d56 + 1c82662 commit 18975fd
Show file tree
Hide file tree
Showing 11 changed files with 83 additions and 43 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Changelog
## Unreleased
- Changed the default behavior around the use of derived fields (through grok fields). By default, derived fields are
no longer added as new fields when an outlier event is found. To enable this, set
the `use_derived_fields` use case parameter to `1`.
- Add new version of word2vec analyzer
- Add the option to highlight the part that matched the use case for simplequery models
- Support for multiple use-cases in one configuration file
Expand Down
2 changes: 1 addition & 1 deletion app/analyzers/simplequery.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def _extract_additional_model_settings(self):
self.model_settings["highlight_match"] = self.config_section.getboolean("highlight_match")
if self.model_settings["highlight_match"] is None:
try:
self.model_settings["highlight_match"] = settings.config.get("simplequery", "highlight_match")
self.model_settings["highlight_match"] = settings.config.getboolean("simplequery", "highlight_match")
except (NoSectionError, NoOptionError):
self.model_settings["highlight_match"] = False

Expand Down
2 changes: 0 additions & 2 deletions app/analyzers/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,6 @@ def _extract_additional_model_settings(self):

self.model_settings["aggregator"] = self.config_section["aggregator"].replace(' ', '').split(",")

self.model_settings["use_derived_fields"] = self.config_section.getboolean("use_derived_fields")

# word2vec_batch_eval_size parameter
self.model_settings["word2vec_batch_eval_size"] = self.extract_parameter("word2vec_batch_eval_size",
param_type="int")
Expand Down
8 changes: 5 additions & 3 deletions app/helpers/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def _extract_model_settings(self):

# by default, we don't process documents chronologically when analyzing the model, as it
# has a high impact on performance when scanning in Elasticsearch
model_settings["process_documents_chronologically"] = True
model_settings["process_documents_chronologically"] = False

model_settings["es_query_filter"] = self.config_section.get("es_query_filter")
if model_settings["es_query_filter"]:
Expand Down Expand Up @@ -94,7 +94,7 @@ def _extract_model_settings(self):
except NoOptionError:
model_settings["should_notify"] = False

model_settings["use_derived_fields"] = self.config_section.getboolean("use_derived_fields")
model_settings["use_derived_fields"] = self.config_section.getboolean("use_derived_fields", fallback=False)

model_settings["es_index"] = self.config_section.get("es_index")
if not model_settings["es_index"]:
Expand Down Expand Up @@ -226,7 +226,9 @@ def process_outlier(self, outlier):
if settings.print_outliers_to_console:
logging.logger.debug("%s [whitelisted outlier]", outlier.outlier_dict["summary"])
else:
es.process_outlier(outlier=outlier, should_notify=self.model_settings["should_notify"])
es.process_outlier(outlier=outlier,
should_notify=self.model_settings["should_notify"],
extract_derived_fields=self.model_settings["use_derived_fields"])

def print_analysis_intro(self, event_type, total_events):
"""
Expand Down
21 changes: 13 additions & 8 deletions app/helpers/es.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,16 +321,17 @@ def remove_all_outliers(self):
else:
self.logging.logger.info("no existing outliers were found, so nothing was wiped")

def process_outlier(self, outlier=None, should_notify=False):
def process_outlier(self, outlier=None, should_notify=False, extract_derived_fields=False):
"""
Save outlier (if configuration is setup for that), notify (also depending of configuration) and print.
:param outlier: the detected outlier
:param should_notify: True if notification need to be send
:param extract_derived_fields: True to save derived fields
"""

if self.settings.es_save_results:
self.save_outlier(outlier=outlier)
self.save_outlier(outlier=outlier, extract_derived_fields=extract_derived_fields)

if should_notify:
self.notifier.notify_on_outlier(outlier=outlier)
Expand Down Expand Up @@ -399,16 +400,20 @@ def flush_bulk_actions(self, refresh=False):
eshelpers.bulk(self.conn, self.bulk_actions, stats_only=True, refresh=refresh)
self.bulk_actions = []

def save_outlier(self, outlier=None):
def save_outlier(self, outlier=None, extract_derived_fields=False):
"""
Complete (with derived fields) and save outlier to Elasticsearch (via bulk action)
:param outlier: the outlier that need to be save
:param extract_derived_fields: True to save derived fields
"""
# add the derived fields as outlier observations
derived_fields = self.extract_derived_fields(outlier.doc["_source"])
for derived_field, derived_value in derived_fields.items():
outlier.outlier_dict["derived_" + derived_field] = derived_value
if extract_derived_fields:
# add the derived fields as outlier observations
derived_fields = self.extract_derived_fields(outlier.doc["_source"])
for derived_field, derived_value in derived_fields.items():
outlier.outlier_dict["derived_" + derived_field] = derived_value
# delete temporary derived fields
del outlier.doc["_source"][derived_field]

doc = add_outlier_to_document(outlier)
self.add_update_bulk_action(doc)
Expand Down Expand Up @@ -497,7 +502,7 @@ def _get_highlight_settings(model_settings):
:return: highlight_settings: Highlight settings
"""
highlight_settings = None
if model_settings["highlight_match"]:
if "highlight_match" in model_settings and model_settings["highlight_match"]:
highlight_settings = dict()
# Pre and post tag definition
highlight_settings["pre_tags"] = ["<value>"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,14 +109,7 @@
"outlier"
],
"timestamp": "2018-07-02T07:54:30.549523+00:00",
"type": "eagleeye",
"timestamp_year": "2018",
"timestamp_month": "07",
"timestamp_day": "02",
"timestamp_hour": "07",
"timestamp_minute": "54",
"timestamp_second": "30.549523",
"timestamp_timezone": "+00:00"
"type": "eagleeye"
},
"_type": "doc",
"sort": [
Expand Down
14 changes: 2 additions & 12 deletions app/tests/unit_tests/test_analyzer_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,16 +144,6 @@ def test_metrics_small_batch_last_outlier(self):

self.assertEqual(analyzer.total_outliers, 1)

def test_metrics_use_derived_fields_in_doc(self):
dummy_doc_generate = DummyDocumentsGenerate()
self.test_es.add_doc(dummy_doc_generate.generate_document())

self.test_settings.change_configuration_path("/app/tests/unit_tests/files/metrics_test_01.conf")
analyzer = AnalyzerFactory.create("/app/tests/unit_tests/files/use_cases/metrics/metrics_dummy_test_derived.conf")
analyzer.evaluate_model()

result = [elem for elem in es._scan()][0]
self.assertTrue("timestamp_year" in result['_source'])

def test_metrics_use_derived_fields_in_outlier(self):
dummy_doc_generate = DummyDocumentsGenerate()
Expand All @@ -177,7 +167,7 @@ def test_metrics_not_use_derived_fields_in_doc(self):
result = [elem for elem in es._scan()][0]
self.assertFalse("timestamp_year" in result['_source'])

def test_metrics_not_use_derived_fields_but_present_in_outlier(self):
def test_metrics_not_use_derived_fields_not_present_in_outlier(self):
dummy_doc_generate = DummyDocumentsGenerate()
self.test_es.add_doc(dummy_doc_generate.generate_document({"user_id": 11}))

Expand All @@ -187,7 +177,7 @@ def test_metrics_not_use_derived_fields_but_present_in_outlier(self):

result = [elem for elem in es._scan()][0]
# The parameter use_derived_fields haven't any impact on outliers keys
self.assertTrue("derived_timestamp_year" in result['_source']['outliers'])
self.assertFalse("derived_timestamp_year" in result['_source']['outliers'])

def test_whitelist_batch_document_not_process_all(self):
self.test_settings.change_configuration_path("/app/tests/unit_tests/files/metrics_test_with_whitelist.conf")
Expand Down
8 changes: 4 additions & 4 deletions app/tests/unit_tests/test_analyzer_simplequery.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def test_simplequry_use_matched_values_in_outlier(self):
result = [elem for elem in es._scan()][0]
self.assertTrue("matched_values" in result['_source']['outliers'])

def test_simplequery_use_derived_fields_in_doc(self):
def test_simplequery_not_use_derived_fields_in_doc(self):
dummy_doc_generate = DummyDocumentsGenerate()
self.test_es.add_doc(dummy_doc_generate.generate_document())

Expand All @@ -228,7 +228,7 @@ def test_simplequery_use_derived_fields_in_doc(self):
analyzer.evaluate_model()

result = [elem for elem in es._scan()][0]
self.assertTrue("timestamp_year" in result['_source'])
self.assertFalse("timestamp_year" in result['_source'])

def test_simplequery_use_derived_fields_in_outlier(self):
dummy_doc_generate = DummyDocumentsGenerate()
Expand All @@ -252,7 +252,7 @@ def test_simplequery_not_use_derived_fields_in_doc(self):
result = [elem for elem in es._scan()][0]
self.assertFalse("timestamp_year" in result['_source'])

def test_simplequery_not_use_derived_fields_but_present_in_outlier(self):
def test_simplequery_not_use_derived_fields_not_present_in_outlier(self):
dummy_doc_generate = DummyDocumentsGenerate()
self.test_es.add_doc(dummy_doc_generate.generate_document())

Expand All @@ -261,7 +261,7 @@ def test_simplequery_not_use_derived_fields_but_present_in_outlier(self):
analyzer.evaluate_model()

result = [elem for elem in es._scan()][0]
self.assertTrue("derived_timestamp_year" in result['_source']['outliers'])
self.assertFalse("derived_timestamp_year" in result['_source']['outliers'])

def test_simplequery_default_outlier_infos(self):
dummy_doc_generate = DummyDocumentsGenerate()
Expand Down
8 changes: 4 additions & 4 deletions app/tests/unit_tests/test_analyzer_terms.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ def test_terms_generated_document_coeff_of_variation_respect_min(self):

self.assertEqual(nbr_outliers, len(all_doc))

def test_terms_use_derived_fields_in_doc(self):
def test_terms_use_derived_fields_check_not_in_doc(self):
dummy_doc_generate = DummyDocumentsGenerate()
self.test_es.add_doc(dummy_doc_generate.generate_document())

Expand All @@ -267,7 +267,7 @@ def test_terms_use_derived_fields_in_doc(self):
analyzer.evaluate_model()

result = [elem for elem in es._scan()][0]
self.assertTrue("timestamp_year" in result['_source'])
self.assertFalse("timestamp_year" in result['_source'])

def test_terms_use_derived_fields_in_outlier(self):
dummy_doc_generate = DummyDocumentsGenerate()
Expand All @@ -291,7 +291,7 @@ def test_terms_not_use_derived_fields_in_doc(self):
result = [elem for elem in es._scan()][0]
self.assertFalse("timestamp_year" in result['_source'])

def test_terms_not_use_derived_fields_but_present_in_outlier(self):
def test_terms_not_use_derived_fields_not_present_in_outlier(self):
dummy_doc_generate = DummyDocumentsGenerate()
self.test_es.add_doc(dummy_doc_generate.generate_document({"user_id": 11}))

Expand All @@ -301,7 +301,7 @@ def test_terms_not_use_derived_fields_but_present_in_outlier(self):

result = [elem for elem in es._scan()][0]
# The parameter use_derived_fields haven't any impact on outliers keys
self.assertTrue("derived_timestamp_year" in result['_source']['outliers'])
self.assertFalse("derived_timestamp_year" in result['_source']['outliers'])

def test_terms_default_outlier_infos(self):
dummy_doc_generate = DummyDocumentsGenerate()
Expand Down
47 changes: 47 additions & 0 deletions app/tests/unit_tests/test_es.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,12 @@
from helpers.analyzerfactory import AnalyzerFactory

test_file_whitelist_path_config = "/app/tests/unit_tests/files/whitelist_tests_01_with_general.conf"
config_file_path = "/app/tests/unit_tests/files/"
config_file_simplequery_test_01 = config_file_path + "simplequery_test_01.conf"

helpers.analyzerfactory.CLASS_MAPPING["analyzer"] = TestStubAnalyzer


class TestEs(unittest.TestCase):

def setUp(self):
Expand Down Expand Up @@ -75,3 +78,47 @@ def test_remove_all_whitelisted_outliers(self):
# Check that outlier is correctly remove
result = [doc for doc in es._scan()][0]
self.assertFalse("outliers" in result["_source"])

def test_get_highlight_settings_with_metrics_analyzer(self):
    """
    Check that _get_highlight_settings returns None for the model settings of the metrics dummy use case.
    """
    self.test_settings.change_configuration_path("/app/tests/unit_tests/files/metrics_test_01.conf")
    analyzer = AnalyzerFactory.create("/app/tests/unit_tests/files/use_cases/metrics/metrics_dummy_test.conf")
    highlight_settings = es._get_highlight_settings(analyzer.model_settings)
    # assertIsNone prints the unexpected value on failure, assertTrue(x is None) only prints "False is not true"
    self.assertIsNone(highlight_settings)

def test_get_highlight_settings_with_terms_analyzer(self):
    """
    Check that _get_highlight_settings returns None for the model settings of the terms dummy use case.
    """
    self.test_settings.change_configuration_path("/app/tests/unit_tests/files/terms_test_01.conf")
    analyzer = AnalyzerFactory.create("/app/tests/unit_tests/files/use_cases/terms/terms_dummy_test.conf")
    highlight_settings = es._get_highlight_settings(analyzer.model_settings)
    # assertIsNone prints the unexpected value on failure, assertTrue(x is None) only prints "False is not true"
    self.assertIsNone(highlight_settings)

def test_get_highlight_settings_with_simplequery_analyzer_and_highlight_match_activated(self):
    """
    Check the highlight settings built for a simplequery use case whose configuration file is the
    "highlight_match_activated" one: pre/post tags and a wildcard field entry are expected.
    """
    self.test_settings.change_configuration_path(config_file_simplequery_test_01)
    use_case_file = ("/app/tests/unit_tests/files/use_cases/simplequery/"
                     "simplequery_dummy_test_highlight_match_activated.conf")
    analyzer = AnalyzerFactory.create(use_case_file)
    highlight_settings = es._get_highlight_settings(analyzer.model_settings)

    expected_highlight_settings = {
        "pre_tags": ["<value>"],
        "post_tags": ["</value>"],
        "fields": {"*": {}},
    }

    # assertEqual shows a dict diff on failure, which assertTrue(a == b) cannot do
    self.assertEqual(highlight_settings, expected_highlight_settings)

def test_get_highlight_settings_with_simplequery_analyzer_and_highlight_match_unactivated(self):
    """
    Check that _get_highlight_settings returns None for a simplequery use case whose configuration file
    is the "highlight_match_unactivated" one.
    """
    self.test_settings.change_configuration_path(config_file_simplequery_test_01)
    use_case_file = ("/app/tests/unit_tests/files/use_cases/simplequery/"
                     "simplequery_dummy_test_highlight_match_unactivated.conf")
    analyzer = AnalyzerFactory.create(use_case_file)
    highlight_settings = es._get_highlight_settings(analyzer.model_settings)

    # assertIsNone prints the unexpected value on failure, assertTrue(x is None) only prints "False is not true"
    self.assertIsNone(highlight_settings)

def test_get_highlight_settings_with_simplequery_analyzer_without_highlight_parameter(self):
    """
    Check that _get_highlight_settings returns None for the plain simplequery dummy use case
    (presumably one that does not define highlight_match - confirm against the configuration file).
    """
    self.test_settings.change_configuration_path(config_file_simplequery_test_01)
    use_case_file = "/app/tests/unit_tests/files/use_cases/simplequery/simplequery_dummy_test.conf"
    analyzer = AnalyzerFactory.create(use_case_file)
    highlight_settings = es._get_highlight_settings(analyzer.model_settings)

    # assertIsNone prints the unexpected value on failure, assertTrue(x is None) only prints "False is not true"
    self.assertIsNone(highlight_settings)
4 changes: 3 additions & 1 deletion app/tests/unit_tests/test_test_stub_es.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,15 @@ def test_add_doc_same_id_raise_error(self):
def test_flush_bulk_actions_using_one_save_outlier(self):
doc_with_outlier_with_derived_timestamp = copy.deepcopy(doc_with_outlier_with_derived_timestamp_test_file)
doc_without_outlier = copy.deepcopy(doc_without_outlier_test_file)
doc_without_outlier["_source"] = es.extract_fields_from_document(doc_without_outlier,
extract_derived_fields=True)
self.test_es.add_doc(doc_without_outlier)

test_outlier = Outlier(outlier_type="dummy type", outlier_reason="dummy reason",
outlier_summary="dummy summary", doc=doc_without_outlier)
test_outlier.outlier_dict["observation"] = "dummy observation"

es.save_outlier(test_outlier)
es.save_outlier(test_outlier, extract_derived_fields=True)
result = [elem for elem in es._scan()][0]
self.assertEqual(result, doc_with_outlier_with_derived_timestamp)

Expand Down

0 comments on commit 18975fd

Please sign in to comment.