From a12b00aa34f1dff92daa2da9e689f33ce9fc990a Mon Sep 17 00:00:00 2001 From: Yehuda Rosenberg <90599084+RosenbergYehuda@users.noreply.github.com> Date: Thu, 30 Nov 2023 11:59:07 +0200 Subject: [PATCH] YR/Remove-fields-with-one-letter-DBotFindSimilarIncidents/XSUP-29299 (#31161) * fixes * code and test * remove Json feed from this pr * test * note * pre commit * RN * CR and Flake8 * format * pre commit --- Packs/Base/ReleaseNotes/1_32_51.md | 6 ++ .../DBotFindSimilarIncidents.py | 82 ++++++++++--------- .../DBotFindSimilarIncidents.yml | 6 +- .../DBotFindSimilarIncidents_test.py | 42 +++++++++- .../DBotFindSimilarIncidents/README.md | 6 +- Packs/Base/pack_metadata.json | 2 +- 6 files changed, 97 insertions(+), 47 deletions(-) create mode 100644 Packs/Base/ReleaseNotes/1_32_51.md diff --git a/Packs/Base/ReleaseNotes/1_32_51.md b/Packs/Base/ReleaseNotes/1_32_51.md new file mode 100644 index 000000000000..0afc9d1630bb --- /dev/null +++ b/Packs/Base/ReleaseNotes/1_32_51.md @@ -0,0 +1,6 @@ + +#### Scripts + +##### DBotFindSimilarIncidents + +Improved implementation to prevent error messages by excluding fields with fewer than two letters from entering the calculation model. diff --git a/Packs/Base/Scripts/DBotFindSimilarIncidents/DBotFindSimilarIncidents.py b/Packs/Base/Scripts/DBotFindSimilarIncidents/DBotFindSimilarIncidents.py index 948475237ab3..87753ed67c65 100644 --- a/Packs/Base/Scripts/DBotFindSimilarIncidents/DBotFindSimilarIncidents.py +++ b/Packs/Base/Scripts/DBotFindSimilarIncidents/DBotFindSimilarIncidents.py @@ -9,7 +9,7 @@ import json import pandas as pd from scipy.spatial.distance import cdist -from typing import Any, List, Dict, Union +from typing import Any warnings.simplefilter("ignore") warnings.filterwarnings('ignore', category=UserWarning) @@ -60,7 +60,7 @@ REPLACE_COMMAND_LINE = {"=": " = ", "\\": "/", "[": "", "]": "", '"': "", "'": "", } -def keep_high_level_field(incidents_field: List[str]) -> List[str]: +def keep_high_level_field(incidents_field: list[str]) -> list[str]: """ Return list of fields if they are in the first level of the argument - xdralert.commandline will return xdralert :param incidents_field: list of incident fields @@ -69,7 +69,7 @@ def keep_high_level_field(incidents_field: List[str]) -> List[str]: return [x.split('.')[0] if '.' in x else x for x in incidents_field] -def wrapped_list(obj: List) -> List: +def wrapped_list(obj: list) -> list: """ Wrapped object into a list if not list :param obj: @@ -80,7 +80,7 @@ def wrapped_list(obj: List) -> List: return obj -def preprocess_incidents_field(incidents_field: str, prefix_to_remove: List[str]) -> str: +def preprocess_incidents_field(incidents_field: str, prefix_to_remove: list[str]) -> str: """ Remove prefixe from incident fields :param incidents_field: field @@ -103,13 +103,13 @@ def check_list_of_dict(obj) -> bool: # type: ignore return bool(obj) and all(isinstance(elem, dict) for elem in obj) # type: ignore -def remove_duplicates(seq: List[str]) -> List[str]: +def remove_duplicates(seq: list[str]) -> list[str]: seen = set() # type: ignore seen_add = seen.add return [x for x in seq if not (x in seen or seen_add(x))] -def recursive_filter(item: Union[List[Dict], Dict], regex_patterns: List, *fieldsToRemove): +def recursive_filter(item: list[dict] | dict, regex_patterns: list, *fieldsToRemove): """ :param item: Dict of list of Dict @@ -157,7 +157,7 @@ def normalize_json(obj) -> str: # type: ignore if isinstance(obj, str): obj = json.loads(obj) if check_list_of_dict(obj): - obj = {k: v for k, v in enumerate(obj)} + obj = dict(enumerate(obj)) if not isinstance(obj, dict): return " " my_dict = recursive_filter(obj, REGEX_DATE_PATTERN, "None", "N/A", None, "") @@ -187,19 +187,19 @@ def normalize_command_line(command: str) -> str: return '' -def fill_nested_fields(incidents_df: pd.DataFrame, incidents: pd.DataFrame, *list_of_field_list: List[str]) -> \ +def fill_nested_fields(incidents_df: pd.DataFrame, incidents: pd.DataFrame, *list_of_field_list: list[str]) -> \ pd.DataFrame: for field_type in list_of_field_list: for field in field_type: if '.' in field: if isinstance(incidents, list): value_list = [wrapped_list(demisto.dt(incident, field)) for incident in incidents] - value_list = [' '.join(set(list(filter(lambda x: x not in ['None', None, 'N/A'], x)))) for x in + value_list = [' '.join(set(filter(lambda x: x not in ['None', None, 'N/A'], x))) for x in value_list] else: value_list = wrapped_list(demisto.dt(incidents, field)) value_list = ' '.join( # type: ignore - set(list(filter(lambda x: x not in ['None', None, 'N/A'], value_list)))) # type: ignore + set(filter(lambda x: x not in ['None', None, 'N/A'], value_list))) # type: ignore incidents_df[field] = value_list return incidents_df @@ -392,15 +392,15 @@ def init_prediction(self, p_incident_to_match, p_incidents_df, p_field_for_comma self.field_for_json = p_field_for_json def predict(self): - self.remove_empty_field() + self.remove_empty_or_short_fields() self.get_score() self.compute_final_score() return self.prepare_for_display(), self.field_for_command_line + self.field_for_potential_exact_match + \ self.field_for_json - def remove_empty_field(self): + def remove_empty_or_short_fields(self): """ - Remove field where value if empty or unusable or does not exist in the incident... + Remove field where value is empty or is shorter than 2 characters or unusable or does not exist in the incident. :return: """ remove_list = [] @@ -410,6 +410,7 @@ def remove_empty_field(self): or (not isinstance(self.incident_to_match[field].values[0], str) and not isinstance( self.incident_to_match[field].values[0], list)) \ or self.incident_to_match[field].values[0] == 'None' \ + or len(self.incident_to_match[field].values[0]) < 2 \ or self.incident_to_match[field].values[0] == 'N/A': remove_list.append(field) self.field_for_command_line = [x for x in self.field_for_command_line if x not in remove_list] @@ -418,6 +419,7 @@ def remove_empty_field(self): for field in self.field_for_potential_exact_match: if field not in self.incident_to_match.columns or not self.incident_to_match[field].values[ 0] or not isinstance(self.incident_to_match[field].values[0], str) or \ + len(self.incident_to_match[field].values[0]) < 2 or \ self.incident_to_match[field].values[0] == 'None' or self.incident_to_match[field].values[ 0] == 'N/A': remove_list.append(field) @@ -426,10 +428,12 @@ def remove_empty_field(self): remove_list = [] for field in self.field_for_json: if field not in self.incident_to_match.columns or not self.incident_to_match[field].values[ - 0] or self.incident_to_match[field].values[0] == 'None' or self.incident_to_match[field].values[ - 0] == 'N/A' or all(not x for x in self.incident_to_match[field].values[0]): + 0] or self.incident_to_match[field].values[0] == 'None' \ + or len(self.incident_to_match[field].values[0]) < 2 \ + or self.incident_to_match[field].values[0] == 'N/A' \ + or all(not x for x in self.incident_to_match[field].values[0]): remove_list.append(field) - self.field_for_json = [x for x in self.field_for_json if x not in remove_list] + self.field_for_json = [x for x in self.field_for_json if x not in remove_list] def get_score(self): """ @@ -479,7 +483,7 @@ def return_clean_date(timestamp: str) -> str: def prepare_incidents_for_display(similar_incidents: pd.DataFrame, confidence: float, show_distance: bool, max_incidents: int, - fields_used: List[str], + fields_used: list[str], aggregate: str, include_indicators_similarity: bool) -> pd.DataFrame: """ Organize data @@ -493,14 +497,14 @@ def prepare_incidents_for_display(similar_incidents: pd.DataFrame, confidence: f :return: Clean Dataframe """ if 'id' in similar_incidents.columns.tolist(): - similar_incidents[COLUMN_ID] = similar_incidents['id'].apply(lambda _id: "[%s](#/Details/%s)" % (_id, _id)) + similar_incidents[COLUMN_ID] = similar_incidents['id'].apply(lambda _id: f"[{_id}](#/Details/{_id})") if COLUMN_TIME in similar_incidents.columns: similar_incidents[COLUMN_TIME] = similar_incidents[COLUMN_TIME].apply(lambda x: return_clean_date(x)) if aggregate == 'True': agg_fields = [x for x in similar_incidents.columns if x not in FIELDS_NO_AGGREGATION] similar_incidents = similar_incidents.groupby(agg_fields, as_index=False, dropna=False).agg( { - COLUMN_TIME: lambda x: "%s -> %s" % (min(filter(None, x)), max(filter(None, x))) if len(x) > 1 else x, + COLUMN_TIME: lambda x: f"{min(filter(None, x))} -> {max(filter(None, x))}" if len(x) > 1 else x, 'id': lambda x: ' , '.join(x), COLUMN_ID: lambda x: ' , '.join(x), } @@ -510,7 +514,7 @@ def prepare_incidents_for_display(similar_incidents: pd.DataFrame, confidence: f similar_incidents = similar_incidents[similar_incidents[SIMILARITY_COLUNM_NAME] >= confidence] if show_distance == 'False': col_to_remove = ['similarity %s' % field for field in fields_used] - similar_incidents.drop(col_to_remove, axis=1, inplace=True) + similar_incidents = similar_incidents.drop(col_to_remove, axis=1) if include_indicators_similarity == "True": similar_incidents = similar_incidents.sort_values(by=ORDER_SCORE_WITH_INDICATORS, ascending=False) else: @@ -519,7 +523,7 @@ def prepare_incidents_for_display(similar_incidents: pd.DataFrame, confidence: f return similar_incidents.head(max_incidents) -def get_incident_by_id(incident_id: str, populate_fields: List[str], from_date: str, to_date: str): +def get_incident_by_id(incident_id: str, populate_fields: list[str], from_date: str, to_date: str): """ Get incident acording to incident id :param incident_id: @@ -546,8 +550,8 @@ def get_incident_by_id(incident_id: str, populate_fields: List[str], from_date: return incident[0] -def get_all_incidents_for_time_window_and_exact_match(exact_match_fields: List[str], populate_fields: List[str], - incident: Dict, from_date: str, to_date: str, +def get_all_incidents_for_time_window_and_exact_match(exact_match_fields: list[str], populate_fields: list[str], + incident: dict, from_date: str, to_date: str, query_sup: str, limit: int): """ Get incidents for a time window and exact match for somes fields @@ -566,7 +570,7 @@ def get_all_incidents_for_time_window_and_exact_match(exact_match_fields: List[s if exact_match_field not in incident.keys(): msg += "%s \n" % MESSAGE_NO_FIELD % exact_match_field else: - exact_match_fields_list.append('%s: "%s"' % (exact_match_field, incident[exact_match_field])) + exact_match_fields_list.append(f'{exact_match_field}: "{incident[exact_match_field]}"') query = " AND ".join(exact_match_fields_list) query += " AND -id:%s " % incident['id'] if query_sup: @@ -593,7 +597,7 @@ def get_all_incidents_for_time_window_and_exact_match(exact_match_fields: List[s return incidents, msg -def extract_fields_from_args(arg: List[str]) -> List[str]: +def extract_fields_from_args(arg: list[str]) -> list[str]: fields_list = [preprocess_incidents_field(x.strip(), PREFIXES_TO_REMOVE) for x in arg if x] return list(dict.fromkeys(fields_list)) @@ -644,7 +648,7 @@ def get_args(): # type: ignore show_actual_incident, incident_id, include_indicators_similarity -def load_current_incident(incident_id: str, populate_fields: List[str], from_date: str, to_date: str): +def load_current_incident(incident_id: str, populate_fields: list[str], from_date: str, to_date: str): """ Load current incident if incident_id given or load current incident investigated :param incident_id: incident_id @@ -676,7 +680,7 @@ def remove_fields_not_in_incident(*args, incorrect_fields): return [[x for x in field_type if x not in incorrect_fields] for field_type in args] -def get_similar_incidents_by_indicators(args: Dict): +def get_similar_incidents_by_indicators(args: dict): """ Use DBotFindSimilarIncidentsByIndicators automation and return similars incident from the automation :param args: argument for DBotFindSimilarIncidentsByIndicators automation @@ -698,13 +702,13 @@ def get_data_from_indicators_automation(res, TAG_SCRIPT_INDICATORS_VALUE): return None -def dumps_json_field_in_incident(incident: Dict): +def dumps_json_field_in_incident(incident: dict): """ Dumps value that are dict in for incident values :param incident: json representing the incident :return: """ - for field in incident.keys(): + for field in incident: if isinstance(incident[field], dict): incident[field] = json.dumps(incident[field]) incident_df = pd.DataFrame.from_dict(incident, orient='index').T @@ -712,7 +716,7 @@ def dumps_json_field_in_incident(incident: Dict): def return_outputs_summary(confidence: float, number_incident_fetched: int, number_incidents_found: int, - fields_used: List[str], global_msg: str) -> None: + fields_used: list[str], global_msg: str) -> None: """ Return entry for summary of the automation - Give information about the automation run :param confidence: confidence level given by the user @@ -752,8 +756,8 @@ def create_context_for_incidents(similar_incidents=pd.DataFrame()): def return_outputs_similar_incidents(show_actual_incident: bool, current_incident: pd.DataFrame, - similar_incidents: pd.DataFrame, context: Dict, - tag: Union[str, None] = None): + similar_incidents: pd.DataFrame, context: dict, + tag: str | None = None): """ Return entry and context for similar incidents :param show_actual_incident: Boolean if showing the current incident @@ -798,11 +802,11 @@ def return_outputs_similar_incidents(show_actual_incident: bool, current_inciden "EntryContext": {'DBotFindSimilarIncidents': context}, } if tag is not None: - return_entry["Tags"] = ['SimilarIncidents_{}'.format(tag)] + return_entry["Tags"] = [f'SimilarIncidents_{tag}'] demisto.results(return_entry) -def find_incorrect_fields(populate_fields: List[str], incidents_df: pd.DataFrame, global_msg: str): +def find_incorrect_fields(populate_fields: list[str], incidents_df: pd.DataFrame, global_msg: str): """ Check Field that appear in populate_fields but are not in the incidents_df and return message :param populate_fields: List of fields @@ -839,7 +843,7 @@ def return_outputs_similar_incidents_empty(): outputs={'DBotFindSimilarIncidents': create_context_for_incidents()}) -def enriched_with_indicators_similarity(full_args_indicators_script: Dict, similar_incidents: pd.DataFrame): +def enriched_with_indicators_similarity(full_args_indicators_script: dict, similar_incidents: pd.DataFrame): """ Take DataFrame of similar_incidents and args for indicators script and add information about indicators to similar_incidents @@ -860,9 +864,9 @@ def enriched_with_indicators_similarity(full_args_indicators_script: Dict, simil return similar_incidents -def prepare_current_incident(incident_df: pd.DataFrame, display_fields: List[str], similar_text_field: List[str], - similar_json_field: List[str], similar_categorical_field: List[str], - exact_match_fields: List[str]) -> pd.DataFrame: +def prepare_current_incident(incident_df: pd.DataFrame, display_fields: list[str], similar_text_field: list[str], + similar_json_field: list[str], similar_categorical_field: list[str], + exact_match_fields: list[str]) -> pd.DataFrame: """ Prepare current incident for visualization :param incident_df: incident_df @@ -880,7 +884,7 @@ def prepare_current_incident(incident_df: pd.DataFrame, display_fields: List[str if COLUMN_TIME in incident_filter.columns.tolist(): incident_filter[COLUMN_TIME] = incident_filter[COLUMN_TIME].apply(lambda x: return_clean_date(x)) if 'id' in incident_filter.columns.tolist(): - incident_filter[COLUMN_ID] = incident_filter['id'].apply(lambda _id: "[%s](#/Details/%s)" % (_id, _id)) + incident_filter[COLUMN_ID] = incident_filter['id'].apply(lambda _id: f"[{_id}](#/Details/{_id})") return incident_filter diff --git a/Packs/Base/Scripts/DBotFindSimilarIncidents/DBotFindSimilarIncidents.yml b/Packs/Base/Scripts/DBotFindSimilarIncidents/DBotFindSimilarIncidents.yml index 7d3711089573..4bd0ae829cf8 100644 --- a/Packs/Base/Scripts/DBotFindSimilarIncidents/DBotFindSimilarIncidents.yml +++ b/Packs/Base/Scripts/DBotFindSimilarIncidents/DBotFindSimilarIncidents.yml @@ -2,13 +2,13 @@ args: - description: Incident ID to get the prediction of. If empty, predicts the the current incident ID. name: incidentId - auto: PREDEFINED - description: 'Comma-separated list of incident text fields to take into account when computing similarity. For example: commandline, URL.' + description: 'Comma-separated list of incident text fields to take into account when computing similarity. For example: commandline, URL. Note: In order to calculate similarity, fields must consist of a minimum of 2 letters.' name: similarTextField - auto: PREDEFINED - description: 'Comma-separated list of incident categorical fields to take into account whe computing similarity. For example: IP, URL.' + description: 'Comma-separated list of incident categorical fields to take into account whe computing similarity. For example: IP, URL. Note: In order to calculate similarity, fields must consist of a minimum of 2 letters.' name: similarCategoricalField - auto: PREDEFINED - description: 'Comma-separated list of incident JSON fields to take into account whe computing similarity. For example: CustomFields.' + description: 'Comma-separated list of incident JSON fields to take into account whe computing similarity. For example: CustomFields. Note: In order to calculate similarity, fields must consist of a minimum of 2 letters.' name: similarJsonField - auto: PREDEFINED description: Comma-separated list of additional incident fields to display, but which will not be taken into account when computing similarity. diff --git a/Packs/Base/Scripts/DBotFindSimilarIncidents/DBotFindSimilarIncidents_test.py b/Packs/Base/Scripts/DBotFindSimilarIncidents/DBotFindSimilarIncidents_test.py index 0a0a327c7b06..57a7b17751a3 100644 --- a/Packs/Base/Scripts/DBotFindSimilarIncidents/DBotFindSimilarIncidents_test.py +++ b/Packs/Base/Scripts/DBotFindSimilarIncidents/DBotFindSimilarIncidents_test.py @@ -3,8 +3,9 @@ preprocess_incidents_field, PREFIXES_TO_REMOVE, check_list_of_dict, REGEX_IP, match_one_regex, \ SIMILARITY_COLUNM_NAME_INDICATOR, SIMILARITY_COLUNM_NAME, euclidian_similarity_capped, find_incorrect_fields, \ MESSAGE_NO_INCIDENT_FETCHED, MESSAGE_INCORRECT_FIELD, MESSAGE_WARNING_TRUNCATED, COLUMN_ID, COLUMN_TIME, \ - TAG_SCRIPT_INDICATORS + TAG_SCRIPT_INDICATORS, Model +import pytest import json import numpy as np import pandas as pd @@ -344,3 +345,42 @@ def test_build_message_of_values(): baz = ['baz1', 'baz2'] assert build_message_of_values([foo, bar, baz]) == "foo_value; bar_value; ['baz1', 'baz2']" + + +@pytest.fixture +def sample_data(): + # Create sample data for testing + data = {'created': ["2019-02-20T15:47:23.962164+02:00"], + 'Name': ["t"], + 'Id': [["123"]], + 'test': [None], + 'xdralerts': ['N/A'], + "test2": [""]} + return pd.DataFrame(data) + + +fields_to_match = ['created', 'Name', 'test', 'Id', 'test2', 'xdralerts'] +expected_results = ['created'] + + +def test_remove_empty_or_short_fields(sample_data): + """ + Given: + - sample_data: a dataframe with a column of strings + When: + - calling remove_empty_or_short_fields function + Then: + - assert that the function removes empty or short or None or 'N/A' or list objects fields + """ + # Create an instance of Model + my_instance = Model({}) + my_instance.incident_to_match = sample_data + + my_instance.field_for_command_line = fields_to_match + my_instance.field_for_potential_exact_match = fields_to_match + my_instance.field_for_json = fields_to_match + + my_instance.remove_empty_or_short_fields() + assert my_instance.field_for_command_line == expected_results + assert my_instance.field_for_potential_exact_match == expected_results + assert my_instance.field_for_json == expected_results diff --git a/Packs/Base/Scripts/DBotFindSimilarIncidents/README.md b/Packs/Base/Scripts/DBotFindSimilarIncidents/README.md index 5ac5f7465982..0db0c041e307 100644 --- a/Packs/Base/Scripts/DBotFindSimilarIncidents/README.md +++ b/Packs/Base/Scripts/DBotFindSimilarIncidents/README.md @@ -25,9 +25,9 @@ This script is used in the following playbooks and scripts. | **Argument Name** | **Description** | | --- | --- | | incidentId | Incident ID to get the prediction of. If empty, predicts the the current incident ID. | -| similarTextField | Comma-separated list of incident text fields to take into account when computing similarity. For example: commandline, URL | -| similarCategoricalField | Comma-separated list of incident categorical fields to take into account whe computing similarity. For example: IP, URL | -| similarJsonField | Comma-separated list of incident JSON fields to take into account whe computing similarity. For example: CustomFields | +| similarTextField | Comma-separated list of incident text fields to take into account when computing similarity. For example: commandline, URL. Note: In order to calculate similarity, fields must consist of a minimum of 2 letters. | +| similarCategoricalField | Comma-separated list of incident categorical fields to take into account whe computing similarity. For example: IP, URL. Note: In order to calculate similarity, fields must consist of a minimum of 2 letters. | +| similarJsonField | Comma-separated list of incident JSON fields to take into account whe computing similarity. For example: CustomFields. Note: In order to calculate similarity, fields must consist of a minimum of 2 letters. | | fieldsToDisplay | Comma-separated list of additional incident fields to display, but which will not be taken into account when computing similarity. | | fieldExactMatch | Comma-separated list of incident fields that have to be equal to the current incident fields. This helps reduce the query size. | | useAllFields | Whether to use a predefined set of fields and custom fields to compute similarity. If "True", it will ignore values in similarTextField, similarCategoricalField, similarJsonField. | diff --git a/Packs/Base/pack_metadata.json b/Packs/Base/pack_metadata.json index 643ecdfa5f0a..076639dff662 100644 --- a/Packs/Base/pack_metadata.json +++ b/Packs/Base/pack_metadata.json @@ -2,7 +2,7 @@ "name": "Base", "description": "The base pack for Cortex XSOAR.", "support": "xsoar", - "currentVersion": "1.32.50", + "currentVersion": "1.32.51", "author": "Cortex XSOAR", "serverMinVersion": "6.0.0", "url": "https://www.paloaltonetworks.com/cortex",