Skip to content

Commit

Permalink
Merge pull request #491 from emmanvg/graph-eq-changes
Browse files Browse the repository at this point in the history
Graph Equivalence Changes
  • Loading branch information
clenk authored Feb 19, 2021
2 parents f9ca684 + 9945377 commit 64608e7
Show file tree
Hide file tree
Showing 6 changed files with 995 additions and 240 deletions.
119 changes: 95 additions & 24 deletions stix2/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,10 @@
import copy

from .datastore import CompositeDataSource, DataStoreMixin
from .equivalence.graph import graphically_equivalent
from .equivalence.object import ( # noqa: F401
WEIGHTS, check_property_present, custom_pattern_based, exact_match,
list_reference_check, partial_external_reference_based, partial_list_based,
partial_location_distance, partial_string_based, partial_timestamp_based,
reference_check, semantically_equivalent,
)
from .equivalence.graph import graph_equivalence, graph_similarity
from .equivalence.object import object_equivalence, object_similarity
from .parsing import parse as _parse

# TODO: Remove all unused imports that now belong to the equivalence module in the next major release.
# Kept for backwards compatibility.


class ObjectFactory(object):
"""Easily create STIX objects with default values for certain properties.
Expand Down Expand Up @@ -197,23 +189,62 @@ def creator_of(self, obj):
return None

@staticmethod
def semantically_equivalent(obj1, obj2, prop_scores={}, **weight_dict):
"""This method verifies if two objects of the same type are
semantically equivalent.
def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
    """Measure how similar two STIX objects are.

    This is a thin wrapper: every argument is forwarded unchanged to the
    ``object_similarity`` function imported from ``.equivalence.object``.

    Args:
        obj1: A stix2 object instance
        obj2: A stix2 object instance
        prop_scores: A dictionary that can hold individual property scores,
            weights, contributing score, matching score and sum of weights.
        weight_dict: A dictionary that can be used to override settings
            in the similarity process

    Returns:
        float: A number between 0.0 and 100.0 as a measurement of similarity.

    Warning:
        Object types need to have property weights defined for the similarity
        process. Otherwise, those objects will not influence the final score.
        The WEIGHTS dictionary under `stix2.equivalence.object` can give you
        an idea on how to add new entries and pass them via the `weight_dict`
        argument. Similarly, the values or methods can be fine tuned for a
        particular use case.

    Note:
        Default weight_dict:

        .. include:: ../object_default_sem_eq_weights.rst

    Note:
        This implementation follows the Semantic Equivalence Committee Note.
        see `the Committee Note <link here>`__.

    """
    # Delegate to the shared implementation; the score is returned untouched.
    similarity_score = object_similarity(obj1, obj2, prop_scores, **weight_dict)
    return similarity_score

@staticmethod
def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict):
"""This method returns a true/false value if two objects are semantically equivalent.
Internally, it calls the object_similarity function and compares it against the given
threshold value.
Args:
obj1: A stix2 object instance
obj2: A stix2 object instance
prop_scores: A dictionary that can hold individual property scores,
weights, contributing score, matching score and sum of weights.
threshold: A numerical value between 0 and 100 to determine the minimum
score to result in successfully calling both objects equivalent. This
value can be tuned.
weight_dict: A dictionary that can be used to override settings
in the semantic equivalence process
in the similarity process
Returns:
float: A number between 0.0 and 100.0 as a measurement of equivalence.
bool: True if the result of the object similarity is greater than or equal to
the threshold value. False otherwise.
Warning:
Object types need to have property weights defined for the equivalence process.
Object types need to have property weights defined for the similarity process.
Otherwise, those objects will not influence the final score. The WEIGHTS
dictionary under `stix2.equivalence.object` can give you an idea on how to add
new entries and pass them via the `weight_dict` argument. Similarly, the values
Expand All @@ -229,14 +260,14 @@ def semantically_equivalent(obj1, obj2, prop_scores={}, **weight_dict):
see `the Committee Note <link here>`__.
"""
return semantically_equivalent(obj1, obj2, prop_scores, **weight_dict)
return object_equivalence(obj1, obj2, prop_scores, threshold, **weight_dict)

@staticmethod
def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
"""This method verifies if two graphs are semantically equivalent.
def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
"""This method returns a similarity score for two given graphs.
Each DataStore can contain a connected or disconnected graph and the
final result is weighted over the amount of objects we managed to compare.
This approach builds on top of the object-based semantic equivalence process
This approach builds on top of the object-based similarity process
and each comparison can return a value between 0 and 100.
Args:
Expand All @@ -245,13 +276,53 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
prop_scores: A dictionary that can hold individual property scores,
weights, contributing score, matching score and sum of weights.
weight_dict: A dictionary that can be used to override settings
in the semantic equivalence process
in the similarity process
Returns:
float: A number between 0.0 and 100.0 as a measurement of similarity.
Warning:
Object types need to have property weights defined for the similarity process.
Otherwise, those objects will not influence the final score. The WEIGHTS
dictionary under `stix2.equivalence.graph` can give you an idea on how to add
new entries and pass them via the `weight_dict` argument. Similarly, the values
or methods can be fine tuned for a particular use case.
Note:
Default weight_dict:
.. include:: ../graph_default_sem_eq_weights.rst
Note:
This implementation follows the Semantic Equivalence Committee Note.
see `the Committee Note <link here>`__.
"""
return graph_similarity(ds1, ds2, prop_scores, **weight_dict)

@staticmethod
def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict):
"""This method returns a true/false value if two graphs are semantically equivalent.
Internally, it calls the graph_similarity function and compares it against the given
threshold value.
Args:
ds1: A DataStore object instance representing your graph
ds2: A DataStore object instance representing your graph
prop_scores: A dictionary that can hold individual property scores,
weights, contributing score, matching score and sum of weights.
threshold: A numerical value between 0 and 100 to determine the minimum
score to result in successfully calling both graphs equivalent. This
value can be tuned.
weight_dict: A dictionary that can be used to override settings
in the similarity process
Returns:
float: A number between 0.0 and 100.0 as a measurement of equivalence.
bool: True if the result of the graph similarity is greater than or equal to
the threshold value. False otherwise.
Warning:
Object types need to have property weights defined for the equivalence process.
Object types need to have property weights defined for the similarity process.
Otherwise, those objects will not influence the final score. The WEIGHTS
dictionary under `stix2.equivalence.graph` can give you an idea on how to add
new entries and pass them via the `weight_dict` argument. Similarly, the values
Expand All @@ -267,4 +338,4 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
see `the Committee Note <link here>`__.
"""
return graphically_equivalent(ds1, ds2, prop_scores, **weight_dict)
return graph_equivalence(ds1, ds2, prop_scores, threshold, **weight_dict)
2 changes: 1 addition & 1 deletion stix2/equivalence/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Python APIs for STIX 2 Semantic Equivalence.
"""Python APIs for STIX 2 Semantic Equivalence and Similarity.
.. autosummary::
:toctree: equivalence
Expand Down
149 changes: 96 additions & 53 deletions stix2/equivalence/graph/__init__.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,63 @@
"""Python APIs for STIX 2 Graph-based Semantic Equivalence."""
"""Python APIs for STIX 2 Graph-based Semantic Equivalence and Similarity."""
import logging

from ..object import (
WEIGHTS, exact_match, list_reference_check, partial_string_based,
partial_timestamp_based, reference_check, semantically_equivalent,
WEIGHTS, _bucket_per_type, _object_pairs, exact_match,
list_reference_check, object_similarity, partial_string_based,
partial_timestamp_based, reference_check,
)

logger = logging.getLogger(__name__)


def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
"""This method verifies if two graphs are semantically equivalent.
def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict):
    """Tell whether two graphs are semantically equivalent.

    The two graphs are scored with ``graph_similarity`` and the resulting
    score is compared against ``threshold``.

    Args:
        ds1: A DataStore object instance representing your graph
        ds2: A DataStore object instance representing your graph
        prop_scores: A dictionary that can hold individual property scores,
            weights, contributing score, matching score and sum of weights.
            It is passed through to ``graph_similarity``.
        threshold: A numerical value between 0 and 100 to determine the
            minimum score to result in successfully calling both graphs
            equivalent. This value can be tuned.
        weight_dict: A dictionary that can be used to override settings
            in the similarity process

    Returns:
        bool: True if the result of the graph similarity is greater than or
            equal to the threshold value. False otherwise.

    Warning:
        Object types need to have property weights defined for the similarity
        process. Otherwise, those objects will not influence the final score.
        The WEIGHTS dictionary under `stix2.equivalence.graph` can give you
        an idea on how to add new entries and pass them via the `weight_dict`
        argument. Similarly, the values or methods can be fine tuned for a
        particular use case.

    Note:
        Default weight_dict:

        .. include:: ../../graph_default_sem_eq_weights.rst

    Note:
        This implementation follows the Semantic Equivalence Committee Note.
        see `the Committee Note <link here>`__.

    """
    score = graph_similarity(ds1, ds2, prop_scores, **weight_dict)
    # Equivalence is simply "similar enough": the threshold itself counts.
    return score >= threshold


def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
"""This method returns a similarity score for two given graphs.
Each DataStore can contain a connected or disconnected graph and the
final result is weighted over the amount of objects we managed to compare.
This approach builds on top of the object-based semantic equivalence process
This approach builds on top of the object-based similarity process
and each comparison can return a value between 0 and 100.
Args:
Expand All @@ -22,20 +66,20 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
prop_scores: A dictionary that can hold individual property scores,
weights, contributing score, matching score and sum of weights.
weight_dict: A dictionary that can be used to override settings
in the semantic equivalence process
in the similarity process
Returns:
float: A number between 0.0 and 100.0 as a measurement of equivalence.
float: A number between 0.0 and 100.0 as a measurement of similarity.
Warning:
Object types need to have property weights defined for the equivalence process.
Object types need to have property weights defined for the similarity process.
Otherwise, those objects will not influence the final score. The WEIGHTS
dictionary under `stix2.equivalence.graph` can give you an idea on how to add
new entries and pass them via the `weight_dict` argument. Similarly, the values
or methods can be fine tuned for a particular use case.
Note:
Default weights_dict:
Default weight_dict:
.. include:: ../../graph_default_sem_eq_weights.rst
Expand All @@ -44,63 +88,62 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
see `the Committee Note <link here>`__.
"""
results = {}
similarity_score = 0
weights = GRAPH_WEIGHTS.copy()

if weight_dict:
weights.update(weight_dict)

results = {}
depth = weights["_internal"]["max_depth"]

graph1 = ds1.query([])
graph2 = ds2.query([])

graph1.sort(key=lambda x: x["type"])
graph2.sort(key=lambda x: x["type"])

if len(graph1) < len(graph2):
weights["_internal"]["ds1"] = ds1
weights["_internal"]["ds2"] = ds2
g1 = graph1
g2 = graph2
else:
weights["_internal"]["ds1"] = ds2
weights["_internal"]["ds2"] = ds1
g1 = graph2
g2 = graph1

for object1 in g1:
for object2 in g2:
if object1["type"] == object2["type"] and object1["type"] in weights:
iprop_score = {}
result = semantically_equivalent(object1, object2, iprop_score, **weights)
objects1_id = object1["id"]
weights["_internal"]["max_depth"] = depth

if objects1_id not in results:
results[objects1_id] = {"matched": object2["id"], "prop_score": iprop_score, "value": result}
elif result > results[objects1_id]["value"]:
results[objects1_id] = {"matched": object2["id"], "prop_score": iprop_score, "value": result}

equivalence_score = 0
if weights["_internal"]["max_depth"] <= 0:
raise ValueError("weight_dict['_internal']['max_depth'] must be greater than 0")

pairs = _object_pairs(
_bucket_per_type(ds1.query([])),
_bucket_per_type(ds2.query([])),
weights,
)

weights["_internal"]["ds1"] = ds1
weights["_internal"]["ds2"] = ds2

logger.debug("Starting graph similarity process between DataStores: '%s' and '%s'", ds1.id, ds2.id)
for object1, object2 in pairs:
iprop_score = {}
object1_id = object1["id"]
object2_id = object2["id"]

result = object_similarity(object1, object2, iprop_score, **weights)

if object1_id not in results:
results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score, "value": result}
elif result > results[object1_id]["value"]:
results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score, "value": result}

if object2_id not in results:
results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score, "value": result}
elif result > results[object2_id]["value"]:
results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score, "value": result}

matching_score = sum(x["value"] for x in results.values())
sum_weights = len(results) * 100.0
if sum_weights > 0:
equivalence_score = (matching_score / sum_weights) * 100
len_pairs = len(results)
if len_pairs > 0:
similarity_score = matching_score / len_pairs

prop_scores["matching_score"] = matching_score
prop_scores["sum_weights"] = sum_weights
prop_scores["len_pairs"] = len_pairs
prop_scores["summary"] = results

logger.debug(
"DONE\t\tSUM_WEIGHT: %.2f\tMATCHING_SCORE: %.2f\t SCORE: %.2f",
sum_weights,
"DONE\t\tLEN_PAIRS: %.2f\tMATCHING_SCORE: %.2f\t SIMILARITY_SCORE: %.2f",
len_pairs,
matching_score,
equivalence_score,
similarity_score,
)
return equivalence_score
return similarity_score


# default weights used for the graph semantic equivalence process
# default weights used for the graph similarity process
GRAPH_WEIGHTS = WEIGHTS.copy()
GRAPH_WEIGHTS.update({
"grouping": {
Expand Down
Loading

0 comments on commit 64608e7

Please sign in to comment.