Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Graph Equivalence Changes #491

Merged
merged 19 commits into from
Feb 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 95 additions & 24 deletions stix2/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,10 @@
import copy

from .datastore import CompositeDataSource, DataStoreMixin
from .equivalence.graph import graphically_equivalent
from .equivalence.object import ( # noqa: F401
WEIGHTS, check_property_present, custom_pattern_based, exact_match,
list_reference_check, partial_external_reference_based, partial_list_based,
partial_location_distance, partial_string_based, partial_timestamp_based,
reference_check, semantically_equivalent,
)
from .equivalence.graph import graph_equivalence, graph_similarity
from .equivalence.object import object_equivalence, object_similarity
from .parsing import parse as _parse

# TODO: Remove all unused imports that now belong to the equivalence module in the next major release.
# Kept for backwards compatibility.


class ObjectFactory(object):
"""Easily create STIX objects with default values for certain properties.
Expand Down Expand Up @@ -197,23 +189,62 @@ def creator_of(self, obj):
return None

@staticmethod
def semantically_equivalent(obj1, obj2, prop_scores={}, **weight_dict):
"""This method verifies if two objects of the same type are
semantically equivalent.
def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
    """Compute a similarity score for a pair of STIX objects.

    This is a thin convenience wrapper: all arguments are forwarded
    unchanged to the module-level ``object_similarity`` function and its
    result is returned as-is.

    Args:
        obj1: A stix2 object instance
        obj2: A stix2 object instance
        prop_scores: A dictionary that can hold individual property scores,
            weights, contributing score, matching score and sum of weights.
        weight_dict: A dictionary that can be used to override settings
            in the similarity process

    Returns:
        float: A number between 0.0 and 100.0 as a measurement of similarity.

    Warning:
        Object types need to have property weights defined for the similarity
        process. Otherwise, those objects will not influence the final score.
        The WEIGHTS dictionary under `stix2.equivalence.object` can give you an
        idea on how to add new entries and pass them via the `weight_dict`
        argument. Similarly, the values or methods can be fine tuned for a
        particular use case.

    Note:
        Default weight_dict:

        .. include:: ../object_default_sem_eq_weights.rst

    Note:
        This implementation follows the Semantic Equivalence Committee Note.
        see `the Committee Note <link here>`__.

    """
    # Inside the function body this name resolves to the module-level import,
    # not to this static method (class scope is not searched from here).
    score = object_similarity(obj1, obj2, prop_scores, **weight_dict)
    return score

@staticmethod
def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict):
"""This method returns a true/false value if two objects are semantically equivalent.
Internally, it calls the object_similarity function and compares it against the given
threshold value.

Args:
obj1: A stix2 object instance
obj2: A stix2 object instance
prop_scores: A dictionary that can hold individual property scores,
weights, contributing score, matching score and sum of weights.
threshold: A numerical value between 0 and 100 to determine the minimum
score to result in successfully calling both objects equivalent. This
value can be tuned.
weight_dict: A dictionary that can be used to override settings
in the semantic equivalence process
in the similarity process

Returns:
float: A number between 0.0 and 100.0 as a measurement of equivalence.
bool: True if the result of the object similarity is greater than or equal to
the threshold value. False otherwise.

Warning:
Object types need to have property weights defined for the equivalence process.
Object types need to have property weights defined for the similarity process.
Otherwise, those objects will not influence the final score. The WEIGHTS
dictionary under `stix2.equivalence.object` can give you an idea on how to add
new entries and pass them via the `weight_dict` argument. Similarly, the values
Expand All @@ -229,14 +260,14 @@ def semantically_equivalent(obj1, obj2, prop_scores={}, **weight_dict):
see `the Committee Note <link here>`__.

"""
return semantically_equivalent(obj1, obj2, prop_scores, **weight_dict)
return object_equivalence(obj1, obj2, prop_scores, threshold, **weight_dict)

@staticmethod
def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
"""This method verifies if two graphs are semantically equivalent.
def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
"""This method returns a similarity score for two given graphs.
Each DataStore can contain a connected or disconnected graph and the
final result is weighted over the amount of objects we managed to compare.
This approach builds on top of the object-based semantic equivalence process
This approach builds on top of the object-based similarity process
and each comparison can return a value between 0 and 100.

Args:
Expand All @@ -245,13 +276,53 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
prop_scores: A dictionary that can hold individual property scores,
weights, contributing score, matching score and sum of weights.
weight_dict: A dictionary that can be used to override settings
in the semantic equivalence process
in the similarity process

Returns:
float: A number between 0.0 and 100.0 as a measurement of similarity.

Warning:
Object types need to have property weights defined for the similarity process.
Otherwise, those objects will not influence the final score. The WEIGHTS
dictionary under `stix2.equivalence.graph` can give you an idea on how to add
new entries and pass them via the `weight_dict` argument. Similarly, the values
or methods can be fine tuned for a particular use case.

Note:
Default weight_dict:

.. include:: ../graph_default_sem_eq_weights.rst

Note:
This implementation follows the Semantic Equivalence Committee Note.
see `the Committee Note <link here>`__.

"""
return graph_similarity(ds1, ds2, prop_scores, **weight_dict)

@staticmethod
def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict):
"""This method returns a true/false value if two graphs are semantically equivalent.
Internally, it calls the graph_similarity function and compares it against the given
threshold value.

Args:
ds1: A DataStore object instance representing your graph
ds2: A DataStore object instance representing your graph
prop_scores: A dictionary that can hold individual property scores,
weights, contributing score, matching score and sum of weights.
threshold: A numerical value between 0 and 100 to determine the minimum
score to result in successfully calling both graphs equivalent. This
value can be tuned.
weight_dict: A dictionary that can be used to override settings
in the similarity process

Returns:
float: A number between 0.0 and 100.0 as a measurement of equivalence.
bool: True if the result of the graph similarity is greater than or equal to
the threshold value. False otherwise.

Warning:
Object types need to have property weights defined for the equivalence process.
Object types need to have property weights defined for the similarity process.
Otherwise, those objects will not influence the final score. The WEIGHTS
dictionary under `stix2.equivalence.graph` can give you an idea on how to add
new entries and pass them via the `weight_dict` argument. Similarly, the values
Expand All @@ -267,4 +338,4 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
see `the Committee Note <link here>`__.

"""
return graphically_equivalent(ds1, ds2, prop_scores, **weight_dict)
return graph_equivalence(ds1, ds2, prop_scores, threshold, **weight_dict)
2 changes: 1 addition & 1 deletion stix2/equivalence/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Python APIs for STIX 2 Semantic Equivalence.
"""Python APIs for STIX 2 Semantic Equivalence and Similarity.

.. autosummary::
:toctree: equivalence
Expand Down
149 changes: 96 additions & 53 deletions stix2/equivalence/graph/__init__.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,63 @@
"""Python APIs for STIX 2 Graph-based Semantic Equivalence."""
"""Python APIs for STIX 2 Graph-based Semantic Equivalence and Similarity."""
import logging

from ..object import (
WEIGHTS, exact_match, list_reference_check, partial_string_based,
partial_timestamp_based, reference_check, semantically_equivalent,
WEIGHTS, _bucket_per_type, _object_pairs, exact_match,
list_reference_check, object_similarity, partial_string_based,
partial_timestamp_based, reference_check,
)

logger = logging.getLogger(__name__)


def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
"""This method verifies if two graphs are semantically equivalent.
def graph_equivalence(ds1, ds2, prop_scores=None, threshold=70, **weight_dict):
    """Report whether two graphs are semantically equivalent.

    Internally, this calls the graph_similarity function and compares its
    score against the given threshold value.

    Args:
        ds1: A DataStore object instance representing your graph
        ds2: A DataStore object instance representing your graph
        prop_scores: A dictionary that can hold individual property scores,
            weights, contributing score, matching score and sum of weights.
            It is handed to graph_similarity, which writes its results into
            it. When omitted (None), a new dictionary is created for this
            call only.
        threshold: A numerical value between 0 and 100 to determine the minimum
            score to result in successfully calling both graphs equivalent. This
            value can be tuned.
        weight_dict: A dictionary that can be used to override settings
            in the similarity process

    Returns:
        bool: True if the result of the graph similarity is greater than or
            equal to the threshold value. False otherwise.

    Warning:
        Object types need to have property weights defined for the similarity
        process. Otherwise, those objects will not influence the final score.
        The WEIGHTS dictionary under `stix2.equivalence.graph` can give you an
        idea on how to add new entries and pass them via the `weight_dict`
        argument. Similarly, the values or methods can be fine tuned for a
        particular use case.

    Note:
        Default weight_dict:

        .. include:: ../../graph_default_sem_eq_weights.rst

    Note:
        This implementation follows the Semantic Equivalence Committee Note.
        see `the Committee Note <link here>`__.

    """
    if prop_scores is None:
        # A default of ``{}`` is created once at definition time and shared by
        # every call; since graph_similarity stores its results in this
        # mapping, calls that omit the argument would overwrite each other's
        # data. Use a fresh dictionary per call instead.
        prop_scores = {}

    similarity_result = graph_similarity(ds1, ds2, prop_scores, **weight_dict)
    return bool(similarity_result >= threshold)


def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
"""This method returns a similarity score for two given graphs.
Each DataStore can contain a connected or disconnected graph and the
final result is weighted over the amount of objects we managed to compare.
This approach builds on top of the object-based semantic equivalence process
This approach builds on top of the object-based similarity process
and each comparison can return a value between 0 and 100.

Args:
Expand All @@ -22,20 +66,20 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
prop_scores: A dictionary that can hold individual property scores,
weights, contributing score, matching score and sum of weights.
weight_dict: A dictionary that can be used to override settings
in the semantic equivalence process
in the similarity process

Returns:
float: A number between 0.0 and 100.0 as a measurement of equivalence.
float: A number between 0.0 and 100.0 as a measurement of similarity.

Warning:
Object types need to have property weights defined for the equivalence process.
Object types need to have property weights defined for the similarity process.
Otherwise, those objects will not influence the final score. The WEIGHTS
dictionary under `stix2.equivalence.graph` can give you an idea on how to add
new entries and pass them via the `weight_dict` argument. Similarly, the values
or methods can be fine tuned for a particular use case.

Note:
Default weights_dict:
Default weight_dict:

.. include:: ../../graph_default_sem_eq_weights.rst

Expand All @@ -44,63 +88,62 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
see `the Committee Note <link here>`__.

"""
results = {}
similarity_score = 0
weights = GRAPH_WEIGHTS.copy()

if weight_dict:
weights.update(weight_dict)

results = {}
depth = weights["_internal"]["max_depth"]

graph1 = ds1.query([])
graph2 = ds2.query([])

graph1.sort(key=lambda x: x["type"])
graph2.sort(key=lambda x: x["type"])

if len(graph1) < len(graph2):
weights["_internal"]["ds1"] = ds1
weights["_internal"]["ds2"] = ds2
g1 = graph1
g2 = graph2
else:
weights["_internal"]["ds1"] = ds2
weights["_internal"]["ds2"] = ds1
g1 = graph2
g2 = graph1

for object1 in g1:
for object2 in g2:
if object1["type"] == object2["type"] and object1["type"] in weights:
iprop_score = {}
result = semantically_equivalent(object1, object2, iprop_score, **weights)
objects1_id = object1["id"]
weights["_internal"]["max_depth"] = depth

if objects1_id not in results:
results[objects1_id] = {"matched": object2["id"], "prop_score": iprop_score, "value": result}
elif result > results[objects1_id]["value"]:
results[objects1_id] = {"matched": object2["id"], "prop_score": iprop_score, "value": result}

equivalence_score = 0
if weights["_internal"]["max_depth"] <= 0:
raise ValueError("weight_dict['_internal']['max_depth'] must be greater than 0")

pairs = _object_pairs(
_bucket_per_type(ds1.query([])),
_bucket_per_type(ds2.query([])),
weights,
)

weights["_internal"]["ds1"] = ds1
weights["_internal"]["ds2"] = ds2

logger.debug("Starting graph similarity process between DataStores: '%s' and '%s'", ds1.id, ds2.id)
for object1, object2 in pairs:
iprop_score = {}
object1_id = object1["id"]
object2_id = object2["id"]

result = object_similarity(object1, object2, iprop_score, **weights)

if object1_id not in results:
results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score, "value": result}
elif result > results[object1_id]["value"]:
results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score, "value": result}

if object2_id not in results:
results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score, "value": result}
elif result > results[object2_id]["value"]:
results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score, "value": result}

matching_score = sum(x["value"] for x in results.values())
sum_weights = len(results) * 100.0
if sum_weights > 0:
equivalence_score = (matching_score / sum_weights) * 100
len_pairs = len(results)
if len_pairs > 0:
similarity_score = matching_score / len_pairs

prop_scores["matching_score"] = matching_score
prop_scores["sum_weights"] = sum_weights
prop_scores["len_pairs"] = len_pairs
prop_scores["summary"] = results

logger.debug(
"DONE\t\tSUM_WEIGHT: %.2f\tMATCHING_SCORE: %.2f\t SCORE: %.2f",
sum_weights,
"DONE\t\tLEN_PAIRS: %.2f\tMATCHING_SCORE: %.2f\t SIMILARITY_SCORE: %.2f",
len_pairs,
matching_score,
equivalence_score,
similarity_score,
)
return equivalence_score
return similarity_score


# default weights used for the graph semantic equivalence process
# default weights used for the graph similarity process
GRAPH_WEIGHTS = WEIGHTS.copy()
GRAPH_WEIGHTS.update({
"grouping": {
Expand Down
Loading