Unstructured-IO · cragwolfe · Oct 7, 2023 · Oct 5, 2023 · Oct 5, 2023 · Oct 5, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@
 * **Refactor of the ingest cli workflow** The refactored approach uses a dynamically set pipeline with a snapshot along each step to save progress and accommodate continuation from a snapshot if an error occurs. This also allows the pipeline to dynamically assign any number of steps to modify the partitioned content before it gets written to a destination.
 ### Features
 
+* **Adds `edit_distance` calculation metrics** In order to benchmark the cleaned, extracted text with unstructured, an `edit_distance` (Levenshtein distance) metric is included.
 * **Adds detection_origin field to metadata** Problem: Currently isn't an easy way to find out how an element was created. With this change that information is added. Importance: With this information the developers and users are now able to know how an element was created to make decisions on how to use it. In order tu use this feature
 setting UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true is needed.
 

diff --git a/requirements/base.in b/requirements/base.in
@@ -11,4 +11,5 @@ emoji
 dataclasses-json
 python-iso639
 langdetect
-numpy
+numpy
+rapidfuzz
diff --git a/requirements/base.txt b/requirements/base.txt
@@ -46,6 +46,8 @@ python-iso639==2023.6.15
     # via -r requirements/base.in
 python-magic==0.4.27
     # via -r requirements/base.in
+rapidfuzz==3.3.1
+    # via -r requirements/base.in
 regex==2023.10.3
     # via nltk
 requests==2.31.0

diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt
@@ -172,7 +172,9 @@ pytz==2023.3.post1
 pywavelets==1.4.1
     # via scikit-image
 rapidfuzz==3.3.1
-    # via unstructured-paddleocr
+    # via
+    #   -c requirements/base.txt
+    #   unstructured-paddleocr
 rarfile==4.1
     # via visualdl
 requests==2.31.0

diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py
@@ -0,0 +1,69 @@
+import re
+
+import pytest
+
+from unstructured.metrics.text_extraction import calculate_edit_distance
+from unstructured.partition.auto import partition
+
+
+def test_calculate_edit_distance():
+    source_cct = "I like pizza. I like bagels."
+    source_cct_word_space = "I like p i z z a . I like bagles."
+    source_cct_spaces = re.sub(r"\s+", " ", " ".join(source_cct))
+    source_cct_no_space = source_cct.replace(" ", "")
+    source_cct_one_sentence = "I like pizza."
+    source_cct_missing_word = "I like pizza. I like ."
+    source_cct_addn_char = "I like pizza. I like beagles."
+    source_cct_dup_word = "I like pizza pizza. I like bagels."
+
+    assert round(calculate_edit_distance(source_cct, source_cct, return_as="score"), 2) == 1.0
+    assert (
+        round(calculate_edit_distance(source_cct_word_space, source_cct, return_as="score"), 2)
+        == 0.75
+    )
+    assert (
+        round(calculate_edit_distance(source_cct_spaces, source_cct, return_as="score"), 2) == 0.39
+    )
+    assert (
+        round(calculate_edit_distance(source_cct_no_space, source_cct, return_as="score"), 2)
+        == 0.64
+    )
+    assert (
+        round(calculate_edit_distance(source_cct_one_sentence, source_cct, return_as="score"), 2)
+        == 0.0
+    )
+    assert (
+        round(calculate_edit_distance(source_cct_missing_word, source_cct, return_as="score"), 2)
+        == 0.57
+    )
+    assert (
+        round(calculate_edit_distance(source_cct_addn_char, source_cct, return_as="score"), 2)
+        == 0.89
+    )
+    assert (
+        round(calculate_edit_distance(source_cct_dup_word, source_cct, return_as="score"), 2)
+        == 0.79
+    )
+
+
+@pytest.mark.parametrize(
+    ("filename", "expected_score", "expected_distance"),
+    [
+        ("fake-text.txt", 0.78, 38),
+    ],
+)
+def test_calculate_edit_distance_with_filename(filename, expected_score, expected_distance):
+    with open("example-docs/fake-text.txt") as f:
+        source_cct = f.read()
+
+    elements = partition(filename=f"example-docs/{filename}")
+    output_cct = "\n".join([str(el) for el in elements])
+
+    score = calculate_edit_distance(output_cct, source_cct, return_as="score")
+    distance = calculate_edit_distance(output_cct, source_cct, return_as="distance")
+
+    assert score >= 0
+    assert score <= 1.0
+    assert distance >= 0
+    assert round(score, 2) == expected_score
+    assert distance == expected_distance
diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py
@@ -0,0 +1,52 @@
+from typing import Tuple
+
+from rapidfuzz.distance import Levenshtein
+
+
+def calculate_edit_distance(
+    output: str,
+    source: str,
+    weights: Tuple[int, int, int] = (2, 1, 1),
+    return_as: str = "score",
+) -> float:
+    """
+    Calculates edit distance using Levenshtein distance between two strings.
+
+    Args:
+        output (str): The target string to be compared.
+        source (str): The reference string against which 'output' is compared.
+        weights (Tuple[int, int, int], optional): A tuple containing weights
+            for insertion, deletion, and substitution operations in the edit
+            distance calculation. Default is (2, 1, 1).
+        return_as (str, optional): The type of result to return, one of
+            ["score",, "distance"].
+            Default is "score".
+
+    Returns:
+        float: The calculated edit distance or similarity score between
+            the 'output' and 'source' strings.
+
+    Raises:
+        ValueError: If 'return_as' is not one of the valid return types
+        ["score", "distance"].
+
+    Note:
+        This function calculates the edit distance (or similarity score) between
+        two strings using the Levenshtein distance algorithm. The 'weights' parameter
+        allows customizing the cost of insertion, deletion, and substitution
+        operations. The 'return_as' parameter determines the type of result to return:
+        - "score": Returns the similarity score, where 1.0 indicates a perfect match.
+        - "distance": Returns the raw edit distance value.
+
+    """
+    return_types = ["score", "distance"]
+    if return_as not in return_types:
+        raise ValueError("Invalid return value type. Expected one of: %s" % return_types)
+    distance = Levenshtein.distance(output, source, weights=weights)
+    char_len = len(source)
+    bounded_percentage_distance = min(max(distance / char_len, 0.0), 1.0)
+    if return_as == "score":
+        return 1 - bounded_percentage_distance
+    elif return_as == "distance":
+        return distance
+    return 0.0