Unstructured-IO · Klaijan · Oct 11, 2023 · Oct 9, 2023 · Oct 9, 2023 · Oct 9, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,6 +18,7 @@
 * **Adds `edit_distance` calculation metrics** In order to benchmark the cleaned, extracted text with unstructured, `edit_distance` (`Levenshtein distance`) is included.
 * **Adds detection_origin field to metadata** Problem: Currently there isn't an easy way to find out how an element was created. With this change that information is added. Importance: With this information the developers and users are now able to know how an element was created to make decisions on how to use it. In order to use this feature
 setting UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true is needed.
+* **Adds a function that calculates frequency of the element type and its depth** To capture the accuracy of element type extraction, this function counts the occurrences of each unique element type with its depth for use in element metrics.
 
 ### Fixes
 

diff --git a/example-docs/fake-email.txt b/example-docs/fake-email.txt
@@ -0,0 +1,24 @@
+MIME-Version: 1.0
+Date: Fri, 16 Dec 2022 17:04:16 -0500
+Message-ID: <CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>
+Subject: Test Email
+From: Matthew Robinson <[email protected]>
+To: Matthew Robinson <[email protected]>
+Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
+
+--00000000000095c9b205eff92630
+Content-Type: text/plain; charset="UTF-8"
+
+This is a test email to use for unit tests.
+
+Important points:
+
+   - Roses are red
+   - Violets are blue
+
+--00000000000095c9b205eff92630
+Content-Type: text/html; charset="UTF-8"
+
+<div dir="ltr"><div>This is a test email to use for unit tests.</div><div><br></div><div>Important points:</div><div><ul><li>Roses are red</li><li>Violets are blue</li></ul></div></div>
+
+--00000000000095c9b205eff92630--
diff --git a/test_unstructured/metrics/test_element_type.py b/test_unstructured/metrics/test_element_type.py
@@ -0,0 +1,87 @@
+import pytest
+
+from unstructured.metrics.element_type import get_element_type_frequency
+from unstructured.partition.auto import partition
+
+
+# Every element type that appears as a key in the expected results; a type that a
+# case does not list explicitly is expected to have no occurrences.
+ELEMENT_TYPES = (
+    "UncategorizedText",
+    "FigureCaption",
+    "Figure",
+    "Text",
+    "NarrativeText",
+    "ListItem",
+    "BulletedText",
+    "Title",
+    "Address",
+    "EmailAddress",
+    "Image",
+    "PageBreak",
+    "Table",
+    "Header",
+    "Footer",
+    "Caption",
+    "Footnote",
+    "Formula",
+    "List-item",
+    "Page-footer",
+    "Page-header",
+    "Picture",
+    "Section-header",
+    "Headline",
+    "Subheadline",
+    "Abstract",
+    "Threading",
+    "Form",
+    "Field-Name",
+    "Value",
+    "Link",
+)
+
+
+def _expected_frequency(observed):
+    """Build the complete expected mapping for one test case.
+
+    `observed` holds the non-empty entries; every other element type maps to `[]`.
+    """
+    expected = {element_type: [] for element_type in ELEMENT_TYPES}
+    expected.update(observed)
+    return expected
+
+
+@pytest.mark.parametrize(
+    ("filename", "frequency"),
+    [
+        # Email fixture: every expected depth is the string "None".
+        (
+            "fake-email.txt",
+            _expected_frequency(
+                {
+                    "UncategorizedText": [("None", 6)],
+                    "NarrativeText": [("None", 2)],
+                    "ListItem": [("None", 12)],
+                    "Title": [("None", 5)],
+                },
+            ),
+        ),
+        # Presentation fixture: numeric depths are expected, except for the table.
+        (
+            "sample-presentation.pptx",
+            _expected_frequency(
+                {
+                    "NarrativeText": [("0", 3)],
+                    "ListItem": [("0", 6), ("1", 6), ("2", 3)],
+                    "Title": [("0", 4), ("1", 1)],
+                    "Table": [("None", 1)],
+                },
+            ),
+        ),
+    ],
+)
+def test_get_element_type_frequency(filename, frequency):
+    """Partition an example document and compare its type/depth counts to `frequency`."""
+    partitioned = partition(filename=f"example-docs/{filename}")
+    actual = get_element_type_frequency(partitioned)
+    assert actual == frequency
diff --git a/unstructured/metrics/element_type.py b/unstructured/metrics/element_type.py
@@ -0,0 +1,25 @@
+from typing import Dict, List, Optional, Tuple, Union
+
+from unstructured.documents.elements import TYPE_TO_TEXT_ELEMENT_MAP
+
+
+def get_element_type_frequency(
+    elements: List,
+) -> Dict[str, List[Tuple[str, int]]]:
+    """Count elements per category and per `category_depth`.
+
+    Returns a dict with one key per entry of TYPE_TO_TEXT_ELEMENT_MAP (plus any other
+    category found in `elements`). Each value is a list of `(depth, count)` pairs in
+    first-seen order, where `depth` is `str(element.metadata.category_depth)`, so a
+    depth of None appears as the string "None". Unseen categories map to `[]`.
+    """
+    counts: Dict[str, Dict[str, int]] = {key: {} for key in TYPE_TO_TEXT_ELEMENT_MAP}
+    for element in elements:
+        depth = str(element.metadata.category_depth)
+        # setdefault: a category absent from the map is counted rather than raising KeyError.
+        per_depth = counts.setdefault(element.category, {})
+        per_depth[depth] = per_depth.get(depth, 0) + 1
+
+    # Convert unconditionally (no early return for empty input) so that every call,
+    # including one with no elements, yields list values rather than the working dicts.
+    return {category: list(depths.items()) for category, depths in counts.items()}