diff --git a/CHANGELOG.md b/CHANGELOG.md index 424ca2201a..cb0203bbe2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ * **Adds `edit_distance` calculation metrics** In order to benchmark the cleaned, extracted text with unstructured, `edit_distance` (`Levenshtein distance`) is included. * **Adds detection_origin field to metadata** Problem: Currently there isn't an easy way to find out how an element was created. With this change that information is added. Importance: With this information the developers and users are now able to know how an element was created to make decisions on how to use it. In order to use this feature, setting UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true is needed. +* **Adds a function that calculates frequency of the element type and its depth** To capture the accuracy of element type extraction, this function counts the occurrences of each unique element type with its depth for use in element metrics. ### Fixes diff --git a/example-docs/fake-email.txt b/example-docs/fake-email.txt new file mode 100644 index 0000000000..702a408521 --- /dev/null +++ b/example-docs/fake-email.txt @@ -0,0 +1,24 @@ +MIME-Version: 1.0 +Date: Fri, 16 Dec 2022 17:04:16 -0500 +Message-ID: +Subject: Test Email +From: Matthew Robinson +To: Matthew Robinson +Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630" + +--00000000000095c9b205eff92630 +Content-Type: text/plain; charset="UTF-8" + +This is a test email to use for unit tests. + +Important points: + + - Roses are red + - Violets are blue + +--00000000000095c9b205eff92630 +Content-Type: text/html; charset="UTF-8" + +
This is a test email to use for unit tests.

Important points:
  • Roses are red
  • Violets are blue
import json
from typing import Dict, Optional, Tuple, Union


def get_element_type_frequency(
    elements: str,
) -> Union[Dict[Tuple[str, Optional[int]], int], Dict]:
    """Count how often each (element type, category depth) pair occurs.

    Args:
        elements: A JSON string encoding a list of element dicts. Each dict is
            read for its ``"type"`` key and for ``"category_depth"`` inside its
            ``"metadata"`` dict.

    Returns:
        A dict mapping ``(type, category_depth)`` tuples to the number of
        elements carrying that pair. Either member of the tuple is ``None``
        when the corresponding key is absent. Empty input yields an empty
        dict.

    Raises:
        json.JSONDecodeError: If ``elements`` is non-empty but not valid JSON.
    """
    frequency: Dict = {}
    if not elements:
        return frequency
    for element in json.loads(elements):
        element_type = element.get("type")
        # A missing or null "metadata" entry is treated like empty metadata so
        # the element is still counted (with depth None) instead of raising.
        metadata = element.get("metadata") or {}
        key = (element_type, metadata.get("category_depth"))
        frequency[key] = frequency.get(key, 0) + 1
    return frequency