Merge pull request #491 from emmanvg/graph-eq-changes

Graph Equivalence Changes
oasis-open · Feb 19, 2021 · 64608e7 · 64608e7
2 parents f9ca684 + 9945377
commit 64608e7
Show file tree

Hide file tree

Showing 6 changed files with 995 additions and 240 deletions.
diff --git a/stix2/environment.py b/stix2/environment.py
@@ -2,18 +2,10 @@
 import copy
 
 from .datastore import CompositeDataSource, DataStoreMixin
-from .equivalence.graph import graphically_equivalent
-from .equivalence.object import (  # noqa: F401
-    WEIGHTS, check_property_present, custom_pattern_based, exact_match,
-    list_reference_check, partial_external_reference_based, partial_list_based,
-    partial_location_distance, partial_string_based, partial_timestamp_based,
-    reference_check, semantically_equivalent,
-)
+from .equivalence.graph import graph_equivalence, graph_similarity
+from .equivalence.object import object_equivalence, object_similarity
 from .parsing import parse as _parse
 
-# TODO: Remove all unused imports that now belong to the equivalence module in the next major release.
-# Kept for backwards compatibility.
-
 
 class ObjectFactory(object):
     """Easily create STIX objects with default values for certain properties.
@@ -197,23 +189,62 @@ def creator_of(self, obj):
             return None
 
     @staticmethod
-    def semantically_equivalent(obj1, obj2, prop_scores={}, **weight_dict):
-        """This method verifies if two objects of the same type are
-        semantically equivalent.
+    def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
+        """This method returns a measure of how similar the two objects are.
+
+        Args:
+            obj1: A stix2 object instance
+            obj2: A stix2 object instance
+            prop_scores: A dictionary that can hold individual property scores,
+                weights, contributing score, matching score and sum of weights.
+            weight_dict: A dictionary that can be used to override settings
+                in the similarity process
+
+        Returns:
+            float: A number between 0.0 and 100.0 as a measurement of similarity.
+
+        Warning:
+            Object types need to have property weights defined for the similarity process.
+            Otherwise, those objects will not influence the final score. The WEIGHTS
+            dictionary under `stix2.equivalence.object` can give you an idea on how to add
+            new entries and pass them via the `weight_dict` argument. Similarly, the values
+            or methods can be fine tuned for a particular use case.
+
+        Note:
+            Default weight_dict:
+
+            .. include:: ../object_default_sem_eq_weights.rst
+
+        Note:
+            This implementation follows the Semantic Equivalence Committee Note.
+            see `the Committee Note <link here>`__.
+
+        """
+        return object_similarity(obj1, obj2, prop_scores, **weight_dict)
+
+    @staticmethod
+    def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict):
+        """This method returns True if two objects are semantically equivalent, False otherwise.
+        Internally, it calls the object_similarity function and compares its result against
+        the given threshold value.
 
         Args:
             obj1: A stix2 object instance
             obj2: A stix2 object instance
             prop_scores: A dictionary that can hold individual property scores,
                 weights, contributing score, matching score and sum of weights.
+            threshold: A numerical value between 0 and 100 giving the minimum similarity
+                score required for both objects to be considered equivalent. This
+                value can be tuned.
             weight_dict: A dictionary that can be used to override settings
-                in the semantic equivalence process
+                in the similarity process
 
         Returns:
-            float: A number between 0.0 and 100.0 as a measurement of equivalence.
+            bool: True if the result of the object similarity is greater than or equal to
+                the threshold value. False otherwise.
 
         Warning:
-            Object types need to have property weights defined for the equivalence process.
+            Object types need to have property weights defined for the similarity process.
             Otherwise, those objects will not influence the final score. The WEIGHTS
             dictionary under `stix2.equivalence.object` can give you an idea on how to add
             new entries and pass them via the `weight_dict` argument. Similarly, the values
@@ -229,14 +260,14 @@ def semantically_equivalent(obj1, obj2, prop_scores={}, **weight_dict):
             see `the Committee Note <link here>`__.
 
         """
-        return semantically_equivalent(obj1, obj2, prop_scores, **weight_dict)
+        return object_equivalence(obj1, obj2, prop_scores, threshold, **weight_dict)
 
     @staticmethod
-    def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
-        """This method verifies if two graphs are semantically equivalent.
+    def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
+        """This method returns a similarity score for two given graphs.
         Each DataStore can contain a connected or disconnected graph and the
         final result is weighted over the amount of objects we managed to compare.
-        This approach builds on top of the object-based semantic equivalence process
+        This approach builds on top of the object-based similarity process
         and each comparison can return a value between 0 and 100.
 
         Args:
@@ -245,13 +276,53 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
             prop_scores: A dictionary that can hold individual property scores,
                 weights, contributing score, matching score and sum of weights.
             weight_dict: A dictionary that can be used to override settings
-                in the semantic equivalence process
+                in the similarity process
+
+        Returns:
+            float: A number between 0.0 and 100.0 as a measurement of similarity.
+
+        Warning:
+            Object types need to have property weights defined for the similarity process.
+            Otherwise, those objects will not influence the final score. The WEIGHTS
+            dictionary under `stix2.equivalence.graph` can give you an idea on how to add
+            new entries and pass them via the `weight_dict` argument. Similarly, the values
+            or methods can be fine tuned for a particular use case.
+
+        Note:
+            Default weight_dict:
+
+            .. include:: ../graph_default_sem_eq_weights.rst
+
+        Note:
+            This implementation follows the Semantic Equivalence Committee Note.
+            see `the Committee Note <link here>`__.
+
+        """
+        return graph_similarity(ds1, ds2, prop_scores, **weight_dict)
+
+    @staticmethod
+    def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict):
+        """This method returns True if two graphs are semantically equivalent, False otherwise.
+        Internally, it calls the graph_similarity function and compares its result against
+        the given threshold value.
+
+        Args:
+            ds1: A DataStore object instance representing your graph
+            ds2: A DataStore object instance representing your graph
+            prop_scores: A dictionary that can hold individual property scores,
+                weights, contributing score, matching score and sum of weights.
+            threshold: A numerical value between 0 and 100 giving the minimum similarity
+                score required for both graphs to be considered equivalent. This
+                value can be tuned.
+            weight_dict: A dictionary that can be used to override settings
+                in the similarity process
 
         Returns:
-            float: A number between 0.0 and 100.0 as a measurement of equivalence.
+            bool: True if the result of the graph similarity is greater than or equal to
+                the threshold value. False otherwise.
 
         Warning:
-            Object types need to have property weights defined for the equivalence process.
+            Object types need to have property weights defined for the similarity process.
             Otherwise, those objects will not influence the final score. The WEIGHTS
             dictionary under `stix2.equivalence.graph` can give you an idea on how to add
             new entries and pass them via the `weight_dict` argument. Similarly, the values
@@ -267,4 +338,4 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
             see `the Committee Note <link here>`__.
 
         """
-        return graphically_equivalent(ds1, ds2, prop_scores, **weight_dict)
+        return graph_equivalence(ds1, ds2, prop_scores, threshold, **weight_dict)
diff --git a/stix2/equivalence/__init__.py b/stix2/equivalence/__init__.py
@@ -1,4 +1,4 @@
-"""Python APIs for STIX 2 Semantic Equivalence.
+"""Python APIs for STIX 2 Semantic Equivalence and Similarity.
 
 .. autosummary::
    :toctree: equivalence

diff --git a/stix2/equivalence/graph/__init__.py b/stix2/equivalence/graph/__init__.py
@@ -1,19 +1,63 @@
-"""Python APIs for STIX 2 Graph-based Semantic Equivalence."""
+"""Python APIs for STIX 2 Graph-based Semantic Equivalence and Similarity."""
 import logging
 
 from ..object import (
-    WEIGHTS, exact_match, list_reference_check, partial_string_based,
-    partial_timestamp_based, reference_check, semantically_equivalent,
+    WEIGHTS, _bucket_per_type, _object_pairs, exact_match,
+    list_reference_check, object_similarity, partial_string_based,
+    partial_timestamp_based, reference_check,
 )
 
 logger = logging.getLogger(__name__)
 
 
-def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
-    """This method verifies if two graphs are semantically equivalent.
+def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict):
+    """This method returns True if two graphs are semantically equivalent, False otherwise.
+    Internally, it calls the graph_similarity function and compares its result against
+    the given threshold value.
+
+    Args:
+        ds1: A DataStore object instance representing your graph
+        ds2: A DataStore object instance representing your graph
+        prop_scores: A dictionary that can hold individual property scores,
+            weights, contributing score, matching score and sum of weights.
+        threshold: A numerical value between 0 and 100 giving the minimum similarity
+            score required for both graphs to be considered equivalent. This
+            value can be tuned.
+        weight_dict: A dictionary that can be used to override settings
+            in the similarity process
+
+    Returns:
+        bool: True if the result of the graph similarity is greater than or equal to
+            the threshold value. False otherwise.
+
+    Warning:
+        Object types need to have property weights defined for the similarity process.
+        Otherwise, those objects will not influence the final score. The WEIGHTS
+        dictionary under `stix2.equivalence.graph` can give you an idea on how to add
+        new entries and pass them via the `weight_dict` argument. Similarly, the values
+        or methods can be fine tuned for a particular use case.
+
+    Note:
+        Default weight_dict:
+
+        .. include:: ../../graph_default_sem_eq_weights.rst
+
+    Note:
+        This implementation follows the Semantic Equivalence Committee Note.
+        see `the Committee Note <link here>`__.
+
+    """
+    similarity_result = graph_similarity(ds1, ds2, prop_scores, **weight_dict)
+    if similarity_result >= threshold:
+        return True
+    return False
+
+
+def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
+    """This method returns a similarity score for two given graphs.
     Each DataStore can contain a connected or disconnected graph and the
     final result is weighted over the amount of objects we managed to compare.
-    This approach builds on top of the object-based semantic equivalence process
+    This approach builds on top of the object-based similarity process
     and each comparison can return a value between 0 and 100.
 
     Args:
@@ -22,20 +66,20 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
         prop_scores: A dictionary that can hold individual property scores,
             weights, contributing score, matching score and sum of weights.
         weight_dict: A dictionary that can be used to override settings
-            in the semantic equivalence process
+            in the similarity process
 
     Returns:
-        float: A number between 0.0 and 100.0 as a measurement of equivalence.
+        float: A number between 0.0 and 100.0 as a measurement of similarity.
 
     Warning:
-        Object types need to have property weights defined for the equivalence process.
+        Object types need to have property weights defined for the similarity process.
         Otherwise, those objects will not influence the final score. The WEIGHTS
         dictionary under `stix2.equivalence.graph` can give you an idea on how to add
         new entries and pass them via the `weight_dict` argument. Similarly, the values
         or methods can be fine tuned for a particular use case.
 
     Note:
-        Default weights_dict:
+        Default weight_dict:
 
         .. include:: ../../graph_default_sem_eq_weights.rst
 
@@ -44,63 +88,62 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
         see `the Committee Note <link here>`__.
 
     """
+    results = {}
+    similarity_score = 0
     weights = GRAPH_WEIGHTS.copy()
 
     if weight_dict:
         weights.update(weight_dict)
 
-    results = {}
-    depth = weights["_internal"]["max_depth"]
-
-    graph1 = ds1.query([])
-    graph2 = ds2.query([])
-
-    graph1.sort(key=lambda x: x["type"])
-    graph2.sort(key=lambda x: x["type"])
-
-    if len(graph1) < len(graph2):
-        weights["_internal"]["ds1"] = ds1
-        weights["_internal"]["ds2"] = ds2
-        g1 = graph1
-        g2 = graph2
-    else:
-        weights["_internal"]["ds1"] = ds2
-        weights["_internal"]["ds2"] = ds1
-        g1 = graph2
-        g2 = graph1
-
-    for object1 in g1:
-        for object2 in g2:
-            if object1["type"] == object2["type"] and object1["type"] in weights:
-                iprop_score = {}
-                result = semantically_equivalent(object1, object2, iprop_score, **weights)
-                objects1_id = object1["id"]
-                weights["_internal"]["max_depth"] = depth
-
-                if objects1_id not in results:
-                    results[objects1_id] = {"matched": object2["id"], "prop_score": iprop_score, "value": result}
-                elif result > results[objects1_id]["value"]:
-                    results[objects1_id] = {"matched": object2["id"], "prop_score": iprop_score, "value": result}
-
-    equivalence_score = 0
+    if weights["_internal"]["max_depth"] <= 0:
+        raise ValueError("weight_dict['_internal']['max_depth'] must be greater than 0")
+
+    pairs = _object_pairs(
+        _bucket_per_type(ds1.query([])),
+        _bucket_per_type(ds2.query([])),
+        weights,
+    )
+
+    weights["_internal"]["ds1"] = ds1
+    weights["_internal"]["ds2"] = ds2
+
+    logger.debug("Starting graph similarity process between DataStores: '%s' and '%s'", ds1.id, ds2.id)
+    for object1, object2 in pairs:
+        iprop_score = {}
+        object1_id = object1["id"]
+        object2_id = object2["id"]
+
+        result = object_similarity(object1, object2, iprop_score, **weights)
+
+        if object1_id not in results:
+            results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score, "value": result}
+        elif result > results[object1_id]["value"]:
+            results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score, "value": result}
+
+        if object2_id not in results:
+            results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score, "value": result}
+        elif result > results[object2_id]["value"]:
+            results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score, "value": result}
+
     matching_score = sum(x["value"] for x in results.values())
-    sum_weights = len(results) * 100.0
-    if sum_weights > 0:
-        equivalence_score = (matching_score / sum_weights) * 100
+    len_pairs = len(results)
+    if len_pairs > 0:
+        similarity_score = matching_score / len_pairs
+
     prop_scores["matching_score"] = matching_score
-    prop_scores["sum_weights"] = sum_weights
+    prop_scores["len_pairs"] = len_pairs
     prop_scores["summary"] = results
 
     logger.debug(
-        "DONE\t\tSUM_WEIGHT: %.2f\tMATCHING_SCORE: %.2f\t SCORE: %.2f",
-        sum_weights,
+        "DONE\t\tLEN_PAIRS: %.2f\tMATCHING_SCORE: %.2f\t SIMILARITY_SCORE: %.2f",
+        len_pairs,
         matching_score,
-        equivalence_score,
+        similarity_score,
     )
-    return equivalence_score
+    return similarity_score
 
 
-# default weights used for the graph semantic equivalence process
+# default weights used for the graph similarity process
 GRAPH_WEIGHTS = WEIGHTS.copy()
 GRAPH_WEIGHTS.update({
     "grouping": {