diff --git a/CHANGELOG.md b/CHANGELOG.md index 31bd54b877..70db9c3b1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ * **Refactor of the ingest cli workflow** The refactored approach uses a dynamically set pipeline with a snapshot along each step to save progress and accommodate continuation from a snapshot if an error occurs. This also allows the pipeline to dynamically assign any number of steps to modify the partitioned content before it gets written to a destination. ### Features +* **Adds `edit_distance` calculation metrics** In order to benchmark the cleaned, extracted text with unstructured, `edit_distance` (`Levenshtein distance`) is included. * **Adds detection_origin field to metadata** Problem: Currently isn't an easy way to find out how an element was created. With this change that information is added. Importance: With this information the developers and users are now able to know how an element was created to make decisions on how to use it. In order tu use this feature setting UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true is needed. 
import re

import pytest

from unstructured.metrics.text_extraction import calculate_edit_distance
from unstructured.partition.auto import partition


def test_calculate_edit_distance():
    """Spot-check edit-distance scores for a range of perturbations of the same source text.

    Each variant exercises a different degradation mode (extra spaces, removed
    spaces, truncation, dropped word, substituted characters, duplicated word);
    the misspellings (e.g. "bagles") are intentional fixtures.
    """
    source_cct = "I like pizza. I like bagels."
    source_cct_word_space = "I like p i z z a . I like bagles."
    source_cct_spaces = re.sub(r"\s+", " ", " ".join(source_cct))
    source_cct_no_space = source_cct.replace(" ", "")
    source_cct_one_sentence = "I like pizza."
    source_cct_missing_word = "I like pizza. I like ."
    source_cct_addn_char = "I like pizza. I like beagles."
    source_cct_dup_word = "I like pizza pizza. I like bagels."

    assert round(calculate_edit_distance(source_cct, source_cct, return_as="score"), 2) == 1.0
    assert (
        round(calculate_edit_distance(source_cct_word_space, source_cct, return_as="score"), 2)
        == 0.75
    )
    assert (
        round(calculate_edit_distance(source_cct_spaces, source_cct, return_as="score"), 2) == 0.39
    )
    assert (
        round(calculate_edit_distance(source_cct_no_space, source_cct, return_as="score"), 2)
        == 0.64
    )
    assert (
        round(calculate_edit_distance(source_cct_one_sentence, source_cct, return_as="score"), 2)
        == 0.0
    )
    assert (
        round(calculate_edit_distance(source_cct_missing_word, source_cct, return_as="score"), 2)
        == 0.57
    )
    assert (
        round(calculate_edit_distance(source_cct_addn_char, source_cct, return_as="score"), 2)
        == 0.89
    )
    assert (
        round(calculate_edit_distance(source_cct_dup_word, source_cct, return_as="score"), 2)
        == 0.79
    )


@pytest.mark.parametrize(
    ("filename", "expected_score", "expected_distance"),
    [
        ("fake-text.txt", 0.78, 38),
    ],
)
def test_calculate_edit_distance_with_filename(filename, expected_score, expected_distance):
    """Compare partitioned output of a document against its raw text source.

    Fix: both the source read and the partition call now use the parametrized
    ``filename`` — previously the partition path was a literal f-string with no
    placeholder and the source path was hard-coded, so the parametrization had
    no effect.
    """
    with open(f"example-docs/{filename}") as f:
        source_cct = f.read()

    elements = partition(filename=f"example-docs/{filename}")
    output_cct = "\n".join([str(el) for el in elements])

    score = calculate_edit_distance(output_cct, source_cct, return_as="score")
    distance = calculate_edit_distance(output_cct, source_cct, return_as="distance")

    assert score >= 0
    assert score <= 1.0
    assert distance >= 0
    assert round(score, 2) == expected_score
    assert distance == expected_distance
def calculate_edit_distance(
    output: str,
    source: str,
    weights: Tuple[int, int, int] = (2, 1, 1),
    return_as: str = "score",
) -> float:
    """
    Calculates edit distance using Levenshtein distance between two strings.

    Args:
        output (str): The target string to be compared.
        source (str): The reference string against which 'output' is compared.
        weights (Tuple[int, int, int], optional): A tuple containing weights
            for insertion, deletion, and substitution operations in the edit
            distance calculation. Default is (2, 1, 1).
        return_as (str, optional): The type of result to return, one of
            ["score", "distance"]. Default is "score".

    Returns:
        float: The calculated edit distance or similarity score between
            the 'output' and 'source' strings.

    Raises:
        ValueError: If 'return_as' is not one of the valid return types
            ["score", "distance"].

    Note:
        This function calculates the edit distance (or similarity score) between
        two strings using the Levenshtein distance algorithm. The 'weights' parameter
        allows customizing the cost of insertion, deletion, and substitution
        operations. The 'return_as' parameter determines the type of result to return:
        - "score": Returns the similarity score, where 1.0 indicates a perfect match.
        - "distance": Returns the raw (weighted) edit distance value.
    """
    return_types = ["score", "distance"]
    if return_as not in return_types:
        raise ValueError("Invalid return value type. Expected one of: %s" % return_types)
    distance = Levenshtein.distance(output, source, weights=weights)
    if return_as == "distance":
        return distance
    # Normalize the distance by the source length and clamp to [0, 1] so the
    # similarity score is bounded even when the weighted distance exceeds the
    # source length.
    char_len = len(source)
    if char_len == 0:
        # Guard against ZeroDivisionError: an empty source is a perfect match
        # only when nothing had to be edited.
        return 1.0 if distance == 0 else 0.0
    bounded_percentage_distance = min(max(distance / char_len, 0.0), 1.0)
    return 1 - bounded_percentage_distance