Skip to content

Commit

Permalink
Merge pull request #496 from emmanvg/semantic-equivalence-part3
Browse files Browse the repository at this point in the history
Similarity/Equivalence Changes
  • Loading branch information
clenk authored Mar 10, 2021
2 parents 1735752 + c9e66de commit f155e3e
Show file tree
Hide file tree
Showing 8 changed files with 369 additions and 434 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,7 @@ coverage.xml
# Sphinx documentation
docs/_build/
.ipynb_checkpoints
graph_default_sem_eq_weights.rst
object_default_sem_eq_weights.rst
similarity_weights.rst

# PyBuilder
target/
Expand Down
10 changes: 1 addition & 9 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from sphinx.ext.autodoc import ClassDocumenter

from stix2.base import _STIXBase
from stix2.equivalence.graph import GRAPH_WEIGHTS
from stix2.equivalence.object import WEIGHTS
from stix2.version import __version__

Expand Down Expand Up @@ -66,16 +65,9 @@
object_default_sem_eq_weights = object_default_sem_eq_weights.replace('\n', '\n ')
object_default_sem_eq_weights = object_default_sem_eq_weights.replace(' "', ' ')
object_default_sem_eq_weights = object_default_sem_eq_weights.replace('"\n', '\n')
with open('object_default_sem_eq_weights.rst', 'w') as f:
with open('similarity_weights.rst', 'w') as f:
f.write(".. code-block:: python\n\n {}\n\n".format(object_default_sem_eq_weights))

graph_default_sem_eq_weights = json.dumps(GRAPH_WEIGHTS, indent=4, default=lambda o: o.__name__)
graph_default_sem_eq_weights = graph_default_sem_eq_weights.replace('\n', '\n ')
graph_default_sem_eq_weights = graph_default_sem_eq_weights.replace(' "', ' ')
graph_default_sem_eq_weights = graph_default_sem_eq_weights.replace('"\n', '\n')
with open('graph_default_sem_eq_weights.rst', 'w') as f:
f.write(".. code-block:: python\n\n {}\n\n".format(graph_default_sem_eq_weights))


def get_property_type(prop):
"""Convert property classname into pretty string name of property.
Expand Down
11 changes: 1 addition & 10 deletions docs/guide/equivalence.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -4607,20 +4607,11 @@
" ),\n",
"]\n",
"\n",
"\n",
"weights = {\n",
" \"_internal\": {\n",
" \"ignore_spec_version\": False,\n",
" \"versioning_checks\": False,\n",
" \"max_depth\": 1,\n",
" },\n",
"}\n",
"\n",
"memstore1 = MemoryStore(g1)\n",
"memstore2 = MemoryStore(g2)\n",
"prop_scores = {}\n",
"\n",
"similarity_result = env.graph_similarity(memstore1, memstore2, prop_scores, **weights)\n",
"similarity_result = env.graph_similarity(memstore1, memstore2, prop_scores)\n",
"equivalence_result = env.graph_equivalence(memstore1, memstore2, threshold=60)\n",
"\n",
"print(similarity_result)\n",
Expand Down
107 changes: 87 additions & 20 deletions stix2/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,16 +189,31 @@ def creator_of(self, obj):
return None

@staticmethod
def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
def object_similarity(
obj1, obj2, prop_scores={}, ds1=None, ds2=None,
ignore_spec_version=False, versioning_checks=False,
max_depth=1, **weight_dict
):
"""This method returns a measure of how similar the two objects are.
Args:
obj1: A stix2 object instance
obj2: A stix2 object instance
prop_scores: A dictionary that can hold individual property scores,
weights, contributing score, matching score and sum of weights.
weight_dict: A dictionary that can be used to override settings
in the similarity process
ds1 (optional): A DataStore object instance from which to pull related objects
ds2 (optional): A DataStore object instance from which to pull related objects
ignore_spec_version: A boolean indicating whether to test object types
that belong to different spec versions (STIX 2.0 and STIX 2.1 for example).
If set to True this check will be skipped.
versioning_checks: A boolean indicating whether to test multiple revisions
of the same object (when present) to maximize similarity against a
particular version. If set to True the algorithm will perform this step.
max_depth: A positive integer indicating the maximum recursion depth the
algorithm can reach when de-referencing objects and performing the
object_similarity algorithm.
weight_dict: A dictionary that can be used to override what checks are done
to objects in the similarity process.
Returns:
float: A number between 0.0 and 100.0 as a measurement of similarity.
Expand All @@ -213,17 +228,24 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
Note:
Default weight_dict:
.. include:: ../object_default_sem_eq_weights.rst
.. include:: ../similarity_weights.rst
Note:
This implementation follows the Semantic Equivalence Committee Note.
See `the Committee Note <link here>`__.
"""
return object_similarity(obj1, obj2, prop_scores, **weight_dict)
return object_similarity(
obj1, obj2, prop_scores, ds1, ds2, ignore_spec_version,
versioning_checks, max_depth, **weight_dict
)

@staticmethod
def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict):
def object_equivalence(
obj1, obj2, prop_scores={}, threshold=70, ds1=None, ds2=None,
ignore_spec_version=False, versioning_checks=False,
max_depth=1, **weight_dict
):
"""This method returns a true/false value indicating whether two objects are semantically equivalent.
Internally, it calls the object_similarity function and compares it against the given
threshold value.
Expand All @@ -236,8 +258,19 @@ def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict):
threshold: A numerical value between 0 and 100 to determine the minimum
score to result in successfully calling both objects equivalent. This
value can be tuned.
weight_dict: A dictionary that can be used to override settings
in the similarity process
ds1 (optional): A DataStore object instance from which to pull related objects
ds2 (optional): A DataStore object instance from which to pull related objects
ignore_spec_version: A boolean indicating whether to test object types
that belong to different spec versions (STIX 2.0 and STIX 2.1 for example).
If set to True this check will be skipped.
versioning_checks: A boolean indicating whether to test multiple revisions
of the same object (when present) to maximize similarity against a
particular version. If set to True the algorithm will perform this step.
max_depth: A positive integer indicating the maximum recursion depth the
algorithm can reach when de-referencing objects and performing the
object_similarity algorithm.
weight_dict: A dictionary that can be used to override what checks are done
to objects in the similarity process.
Returns:
bool: True if the result of the object similarity is greater than or equal to
Expand All @@ -253,17 +286,23 @@ def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict):
Note:
Default weight_dict:
.. include:: ../object_default_sem_eq_weights.rst
.. include:: ../similarity_weights.rst
Note:
This implementation follows the Semantic Equivalence Committee Note.
See `the Committee Note <link here>`__.
"""
return object_equivalence(obj1, obj2, prop_scores, threshold, **weight_dict)
return object_equivalence(
obj1, obj2, prop_scores, threshold, ds1, ds2,
ignore_spec_version, versioning_checks, max_depth, **weight_dict
)

@staticmethod
def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
def graph_similarity(
ds1, ds2, prop_scores={}, ignore_spec_version=False,
versioning_checks=False, max_depth=1, **weight_dict
):
"""This method returns a similarity score for two given graphs.
Each DataStore can contain a connected or disconnected graph and the
final result is weighted over the number of objects we managed to compare.
Expand All @@ -275,8 +314,17 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
ds2: A DataStore object instance representing your graph
prop_scores: A dictionary that can hold individual property scores,
weights, contributing score, matching score and sum of weights.
weight_dict: A dictionary that can be used to override settings
in the similarity process
ignore_spec_version: A boolean indicating whether to test object types
that belong to different spec versions (STIX 2.0 and STIX 2.1 for example).
If set to True this check will be skipped.
versioning_checks: A boolean indicating whether to test multiple revisions
of the same object (when present) to maximize similarity against a
particular version. If set to True the algorithm will perform this step.
max_depth: A positive integer indicating the maximum recursion depth the
algorithm can reach when de-referencing objects and performing the
object_similarity algorithm.
weight_dict: A dictionary that can be used to override what checks are done
to objects in the similarity process.
Returns:
float: A number between 0.0 and 100.0 as a measurement of similarity.
Expand All @@ -291,17 +339,24 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
Note:
Default weight_dict:
.. include:: ../graph_default_sem_eq_weights.rst
.. include:: ../similarity_weights.rst
Note:
This implementation follows the Semantic Equivalence Committee Note.
See `the Committee Note <link here>`__.
"""
return graph_similarity(ds1, ds2, prop_scores, **weight_dict)
return graph_similarity(
ds1, ds2, prop_scores, ignore_spec_version,
versioning_checks, max_depth, **weight_dict
)

@staticmethod
def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict):
def graph_equivalence(
ds1, ds2, prop_scores={}, threshold=70,
ignore_spec_version=False, versioning_checks=False,
max_depth=1, **weight_dict
):
"""This method returns a true/false value indicating whether two graphs are semantically equivalent.
Internally, it calls the graph_similarity function and compares it against the given
threshold value.
Expand All @@ -314,8 +369,17 @@ def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict):
threshold: A numerical value between 0 and 100 to determine the minimum
score to result in successfully calling both graphs equivalent. This
value can be tuned.
weight_dict: A dictionary that can be used to override settings
in the similarity process
ignore_spec_version: A boolean indicating whether to test object types
that belong to different spec versions (STIX 2.0 and STIX 2.1 for example).
If set to True this check will be skipped.
versioning_checks: A boolean indicating whether to test multiple revisions
of the same object (when present) to maximize similarity against a
particular version. If set to True the algorithm will perform this step.
max_depth: A positive integer indicating the maximum recursion depth the
algorithm can reach when de-referencing objects and performing the
object_similarity algorithm.
weight_dict: A dictionary that can be used to override what checks are done
to objects in the similarity process.
Returns:
bool: True if the result of the graph similarity is greater than or equal to
Expand All @@ -331,11 +395,14 @@ def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict):
Note:
Default weight_dict:
.. include:: ../graph_default_sem_eq_weights.rst
.. include:: ../similarity_weights.rst
Note:
This implementation follows the Semantic Equivalence Committee Note.
See `the Committee Note <link here>`__.
"""
return graph_equivalence(ds1, ds2, prop_scores, threshold, **weight_dict)
return graph_equivalence(
ds1, ds2, prop_scores, threshold, ignore_spec_version,
versioning_checks, max_depth, **weight_dict
)
Loading

0 comments on commit f155e3e

Please sign in to comment.