Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Similarity/Equivalence Changes #496

Merged
merged 10 commits into from
Mar 10, 2021
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,7 @@ coverage.xml
# Sphinx documentation
docs/_build/
.ipynb_checkpoints
graph_default_sem_eq_weights.rst
object_default_sem_eq_weights.rst
similarity_weights.rst

# PyBuilder
target/
Expand Down
10 changes: 1 addition & 9 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from sphinx.ext.autodoc import ClassDocumenter

from stix2.base import _STIXBase
from stix2.equivalence.graph import GRAPH_WEIGHTS
from stix2.equivalence.object import WEIGHTS
from stix2.version import __version__

Expand Down Expand Up @@ -66,16 +65,9 @@
object_default_sem_eq_weights = object_default_sem_eq_weights.replace('\n', '\n ')
object_default_sem_eq_weights = object_default_sem_eq_weights.replace(' "', ' ')
object_default_sem_eq_weights = object_default_sem_eq_weights.replace('"\n', '\n')
with open('object_default_sem_eq_weights.rst', 'w') as f:
with open('similarity_weights.rst', 'w') as f:
f.write(".. code-block:: python\n\n {}\n\n".format(object_default_sem_eq_weights))

graph_default_sem_eq_weights = json.dumps(GRAPH_WEIGHTS, indent=4, default=lambda o: o.__name__)
graph_default_sem_eq_weights = graph_default_sem_eq_weights.replace('\n', '\n ')
graph_default_sem_eq_weights = graph_default_sem_eq_weights.replace(' "', ' ')
graph_default_sem_eq_weights = graph_default_sem_eq_weights.replace('"\n', '\n')
with open('graph_default_sem_eq_weights.rst', 'w') as f:
f.write(".. code-block:: python\n\n {}\n\n".format(graph_default_sem_eq_weights))


def get_property_type(prop):
"""Convert property classname into pretty string name of property.
Expand Down
11 changes: 1 addition & 10 deletions docs/guide/equivalence.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -4607,20 +4607,11 @@
" ),\n",
"]\n",
"\n",
"\n",
"weights = {\n",
" \"_internal\": {\n",
" \"ignore_spec_version\": False,\n",
" \"versioning_checks\": False,\n",
" \"max_depth\": 1,\n",
" },\n",
"}\n",
"\n",
"memstore1 = MemoryStore(g1)\n",
"memstore2 = MemoryStore(g2)\n",
"prop_scores = {}\n",
"\n",
"similarity_result = env.graph_similarity(memstore1, memstore2, prop_scores, **weights)\n",
"similarity_result = env.graph_similarity(memstore1, memstore2, prop_scores)\n",
"equivalence_result = env.graph_equivalence(memstore1, memstore2, threshold=60)\n",
"\n",
"print(similarity_result)\n",
Expand Down
107 changes: 87 additions & 20 deletions stix2/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,16 +189,31 @@ def creator_of(self, obj):
return None

@staticmethod
def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
def object_similarity(
obj1, obj2, prop_scores={}, ds1=None, ds2=None,
ignore_spec_version=False, versioning_checks=False,
max_depth=1, **weight_dict
):
"""This method returns a measure of how similar the two objects are.

Args:
obj1: A stix2 object instance
obj2: A stix2 object instance
prop_scores: A dictionary that can hold individual property scores,
weights, contributing score, matching score and sum of weights.
weight_dict: A dictionary that can be used to override settings
in the similarity process
ds1 (optional): A DataStore object instance from which to pull related objects
ds2 (optional): A DataStore object instance from which to pull related objects
ignore_spec_version: A boolean indicating whether to test object types
that belong to different spec versions (STIX 2.0 and STIX 2.1 for example).
If set to True this check will be skipped.
versioning_checks: A boolean indicating whether to test multiple revisions
of the same object (when present) to maximize similarity against a
particular version. If set to True the algorithm will perform this step.
max_depth: A positive integer indicating the maximum recursion depth the
algorithm can reach when de-referencing objects and performing the
object_similarity algorithm.
weight_dict: A dictionary that can be used to override what checks are done
to objects in the similarity process.

Returns:
float: A number between 0.0 and 100.0 as a measurement of similarity.
Expand All @@ -213,17 +228,24 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
Note:
Default weight_dict:

.. include:: ../object_default_sem_eq_weights.rst
.. include:: ../similarity_weights.rst

Note:
This implementation follows the Semantic Equivalence Committee Note.
See `the Committee Note <link here>`__.

"""
return object_similarity(obj1, obj2, prop_scores, **weight_dict)
return object_similarity(
obj1, obj2, prop_scores, ds1, ds2, ignore_spec_version,
versioning_checks, max_depth, **weight_dict
)

@staticmethod
def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict):
def object_equivalence(
obj1, obj2, prop_scores={}, threshold=70, ds1=None, ds2=None,
ignore_spec_version=False, versioning_checks=False,
max_depth=1, **weight_dict
):
"""This method returns a true/false value if two objects are semantically equivalent.
Internally, it calls the object_similarity function and compares it against the given
threshold value.
Expand All @@ -236,8 +258,19 @@ def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict):
threshold: A numerical value between 0 and 100 to determine the minimum
score to result in successfully calling both objects equivalent. This
value can be tuned.
weight_dict: A dictionary that can be used to override settings
in the similarity process
ds1 (optional): A DataStore object instance from which to pull related objects
ds2 (optional): A DataStore object instance from which to pull related objects
ignore_spec_version: A boolean indicating whether to test object types
that belong to different spec versions (STIX 2.0 and STIX 2.1 for example).
If set to True this check will be skipped.
versioning_checks: A boolean indicating whether to test multiple revisions
of the same object (when present) to maximize similarity against a
particular version. If set to True the algorithm will perform this step.
max_depth: A positive integer indicating the maximum recursion depth the
algorithm can reach when de-referencing objects and performing the
object_similarity algorithm.
weight_dict: A dictionary that can be used to override what checks are done
to objects in the similarity process.

Returns:
bool: True if the result of the object similarity is greater than or equal to
Expand All @@ -253,17 +286,23 @@ def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict):
Note:
Default weight_dict:

.. include:: ../object_default_sem_eq_weights.rst
.. include:: ../similarity_weights.rst

Note:
This implementation follows the Semantic Equivalence Committee Note.
See `the Committee Note <link here>`__.

"""
return object_equivalence(obj1, obj2, prop_scores, threshold, **weight_dict)
return object_equivalence(
obj1, obj2, prop_scores, threshold, ds1, ds2,
ignore_spec_version, versioning_checks, max_depth, **weight_dict
)

@staticmethod
def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
def graph_similarity(
ds1, ds2, prop_scores={}, ignore_spec_version=False,
emmanvg marked this conversation as resolved.
Show resolved Hide resolved
versioning_checks=False, max_depth=1, **weight_dict
):
"""This method returns a similarity score for two given graphs.
Each DataStore can contain a connected or disconnected graph and the
final result is weighted over the number of objects we managed to compare.
Expand All @@ -275,8 +314,17 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
ds2: A DataStore object instance representing your graph
prop_scores: A dictionary that can hold individual property scores,
weights, contributing score, matching score and sum of weights.
weight_dict: A dictionary that can be used to override settings
in the similarity process
ignore_spec_version: A boolean indicating whether to test object types
that belong to different spec versions (STIX 2.0 and STIX 2.1 for example).
If set to True this check will be skipped.
versioning_checks: A boolean indicating whether to test multiple revisions
of the same object (when present) to maximize similarity against a
particular version. If set to True the algorithm will perform this step.
max_depth: A positive integer indicating the maximum recursion depth the
algorithm can reach when de-referencing objects and performing the
object_similarity algorithm.
weight_dict: A dictionary that can be used to override what checks are done
to objects in the similarity process.

Returns:
float: A number between 0.0 and 100.0 as a measurement of similarity.
Expand All @@ -291,17 +339,24 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
Note:
Default weight_dict:

.. include:: ../graph_default_sem_eq_weights.rst
.. include:: ../similarity_weights.rst

Note:
This implementation follows the Semantic Equivalence Committee Note.
See `the Committee Note <link here>`__.

"""
return graph_similarity(ds1, ds2, prop_scores, **weight_dict)
return graph_similarity(
ds1, ds2, prop_scores, ignore_spec_version,
versioning_checks, max_depth, **weight_dict
)

@staticmethod
def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict):
def graph_equivalence(
ds1, ds2, prop_scores={}, threshold=70,
emmanvg marked this conversation as resolved.
Show resolved Hide resolved
ignore_spec_version=False, versioning_checks=False,
max_depth=1, **weight_dict
):
"""This method returns a true/false value if two graphs are semantically equivalent.
Internally, it calls the graph_similarity function and compares it against the given
threshold value.
Expand All @@ -314,8 +369,17 @@ def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict):
threshold: A numerical value between 0 and 100 to determine the minimum
score to result in successfully calling both graphs equivalent. This
value can be tuned.
weight_dict: A dictionary that can be used to override settings
in the similarity process
ignore_spec_version: A boolean indicating whether to test object types
that belong to different spec versions (STIX 2.0 and STIX 2.1 for example).
If set to True this check will be skipped.
versioning_checks: A boolean indicating whether to test multiple revisions
of the same object (when present) to maximize similarity against a
particular version. If set to True the algorithm will perform this step.
max_depth: A positive integer indicating the maximum recursion depth the
algorithm can reach when de-referencing objects and performing the
object_similarity algorithm.
weight_dict: A dictionary that can be used to override what checks are done
to objects in the similarity process.

Returns:
bool: True if the result of the graph similarity is greater than or equal to
Expand All @@ -331,11 +395,14 @@ def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict):
Note:
Default weight_dict:

.. include:: ../graph_default_sem_eq_weights.rst
.. include:: ../similarity_weights.rst

Note:
This implementation follows the Semantic Equivalence Committee Note.
See `the Committee Note <link here>`__.

"""
return graph_equivalence(ds1, ds2, prop_scores, threshold, **weight_dict)
return graph_equivalence(
ds1, ds2, prop_scores, threshold, ignore_spec_version,
versioning_checks, max_depth, **weight_dict
)
Loading