From fe1767d019179b03ac4c2746d2ec924ad02d2c54 Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Thu, 28 Sep 2023 10:55:21 -0500 Subject: [PATCH 01/29] local embedding model from huggingface --- unstructured/embed/huggingface.py | 72 +++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 unstructured/embed/huggingface.py diff --git a/unstructured/embed/huggingface.py b/unstructured/embed/huggingface.py new file mode 100644 index 0000000000..6df0911fd4 --- /dev/null +++ b/unstructured/embed/huggingface.py @@ -0,0 +1,72 @@ +import types +from typing import List, Optional + +import numpy as np + +from unstructured.documents.elements import ( + Element, +) +from unstructured.embed.interfaces import BaseEmbeddingEncoder +from unstructured.ingest.error import EmbeddingEncoderConnectionError +from unstructured.utils import requires_dependencies + + +class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder): + def __init__( + self, + model_name: Optional[str] = "sentence-transformers/all-mpnet-base-v2", + model_kwargs: Optional[dict] = {"device": "cpu"}, + encode_kwargs: Optional[dict] = {"normalize_embeddings": False}, + ): + self.model_name = model_name + self.model_kwargs = model_kwargs + self.encode_kwargs = encode_kwargs + self.initialize() + + @EmbeddingEncoderConnectionError.wrap + @requires_dependencies( + ["langchain", "huggingface", "tiktoken"], + extras="huggingface", + ) + def initialize(self): + """Creates a langchain HuggingFace object to embed elements.""" + from langchain.embeddings import HuggingFaceEmbeddings + + self.hf = HuggingFaceEmbeddings( + model=self.model_name, + model_kwargs=self.model_kwargs, + encode_kwargs=self.encode_kwargs, + ) + self.examplary_embedding = self.hf.embed_query("Q") + + return self.hf + + def num_of_dimensions(self): + return np.shape(self.examplary_embedding) + + def is_unit_vector(self): + return np.isclose(np.linalg.norm(self.examplary_embedding), 1.0) + + def embed_query(self, query): + return 
self.hf.embed_documents([str(query)]) + + def embed_documents(self, elements: List[Element]) -> List[Element]: + embeddings = self.hf.embed_documents([str(e) for e in elements]) + elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings) + return elements_with_embeddings + + def _add_embeddings_to_elements(self, elements, embeddings) -> List[Element]: + assert len(elements) == len(embeddings) + elements_w_embedding = [] + + for i, element in enumerate(elements): + original_method = element.to_dict + + def new_to_dict(self): + d = original_method() + d["embeddings"] = self.embeddings + return d + + element.embeddings = embeddings[i] + elements_w_embedding.append(element) + element.to_dict = types.MethodType(new_to_dict, element) From 672bc8d0c7def7b103cb5893fa210be581e18327 Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Mon, 2 Oct 2023 15:36:31 -0500 Subject: [PATCH 02/29] add arguments --- unstructured/embed/huggingface.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/unstructured/embed/huggingface.py b/unstructured/embed/huggingface.py index 6df0911fd4..45df7ba1df 100644 --- a/unstructured/embed/huggingface.py +++ b/unstructured/embed/huggingface.py @@ -3,6 +3,8 @@ import numpy as np +from langchain.embeddings import HuggingFaceEmbeddings + from unstructured.documents.elements import ( Element, ) @@ -14,29 +16,25 @@ class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder): def __init__( self, - model_name: Optional[str] = "sentence-transformers/all-mpnet-base-v2", + model_name: Optional[str] = "sentence-transformers/all-MiniLM-L6-v2", model_kwargs: Optional[dict] = {"device": "cpu"}, encode_kwargs: Optional[dict] = {"normalize_embeddings": False}, ): self.model_name = model_name self.model_kwargs = model_kwargs self.encode_kwargs = encode_kwargs + self.initialize() - @EmbeddingEncoderConnectionError.wrap - @requires_dependencies( - ["langchain", "huggingface", "tiktoken"], - extras="huggingface", - ) 
def initialize(self): """Creates a langchain HuggingFace object to embed elements.""" - from langchain.embeddings import HuggingFaceEmbeddings self.hf = HuggingFaceEmbeddings( - model=self.model_name, + model_name=self.model_name, model_kwargs=self.model_kwargs, encode_kwargs=self.encode_kwargs, ) + self.examplary_embedding = self.hf.embed_query("Q") return self.hf From a6f9fbbedffb974cb2aeb28743ab4b498c10734d Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Tue, 3 Oct 2023 12:01:08 -0500 Subject: [PATCH 03/29] begin coding bag of words --- unstructured/cleaners/core.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index ba7ec592db..1ddc1a3200 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -6,6 +6,8 @@ import numpy as np +import nltkfrom nltk.corpus import stopwords + from unstructured.file_utils.encoding import ( format_encoding_str, ) @@ -458,3 +460,15 @@ def clean_extra_whitespace_with_index_run(text: str) -> Tuple[str, np.ndarray]: def index_adjustment_after_clean_extra_whitespace(index, moved_indices) -> int: return int(index - moved_indices[index]) + + +def bag_of_words(text: str, remove_stop_words: bool) -> List[Text]: + words = set(remove_punctuation(text.lower()).split()) + + clean_words = [] + if remove_stop_words: + for word in words: + if word not in stopwords.words('english'): + clean_words.append(word) + + \ No newline at end of file From 8511de1ec9eb7a96b311d5aba3d325a720565bd6 Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Thu, 5 Oct 2023 09:46:54 -0500 Subject: [PATCH 04/29] bag of words function --- unstructured/cleaners/core.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index 1ddc1a3200..39fe0b5684 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -2,12 +2,11 @@ import re import sys import 
unicodedata +from collections import Counter from typing import Tuple import numpy as np -import nltkfrom nltk.corpus import stopwords - from unstructured.file_utils.encoding import ( format_encoding_str, ) @@ -462,13 +461,7 @@ def index_adjustment_after_clean_extra_whitespace(index, moved_indices) -> int: return int(index - moved_indices[index]) -def bag_of_words(text: str, remove_stop_words: bool) -> List[Text]: - words = set(remove_punctuation(text.lower()).split()) - - clean_words = [] - if remove_stop_words: - for word in words: - if word not in stopwords.words('english'): - clean_words.append(word) - +def bag_of_words(text: str) -> dict[str]: + words = remove_punctuation(text.lower()).split() + return dict(Counter(words)) \ No newline at end of file From 2722e09d0ee27b102e6cb2155d196e5f93bffa33 Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Thu, 5 Oct 2023 11:38:08 -0500 Subject: [PATCH 05/29] fix syntax --- test_unstructured/cleaners/test_core.py | 13 +++++++++++++ unstructured/cleaners/core.py | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py index eec8edd2b9..579fc0d2f2 100644 --- a/test_unstructured/cleaners/test_core.py +++ b/test_unstructured/cleaners/test_core.py @@ -300,3 +300,16 @@ def test_clean(text, extra_whitespace, dashes, bullets, lowercase, trailing_punc def test_bytes_string_to_string(): text = "\xe6\xaf\x8f\xe6\x97\xa5\xe6\x96\xb0\xe9\x97\xbb" assert core.bytes_string_to_string(text, "utf-8") == "每日新闻" + + +@pytest.mark.parametrize( + ("text", "expected"), + [ + ( + "The dog loved the cat, but the cat loved the cow", + {"the": 4, "cat": 1, "loved": 2, "dog": 2, "but": 1, "cow": 1}, + ), + ], +) +def test_bag_of_words(text, expected): + assert core.bag_of_words(text) == expected diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index 39fe0b5684..f23d930e27 100644 --- a/unstructured/cleaners/core.py +++ 
b/unstructured/cleaners/core.py @@ -461,7 +461,7 @@ def index_adjustment_after_clean_extra_whitespace(index, moved_indices) -> int: return int(index - moved_indices[index]) -def bag_of_words(text: str) -> dict[str]: +def bag_of_words(text: str) -> dict: words = remove_punctuation(text.lower()).split() return dict(Counter(words)) \ No newline at end of file From ed42bc18911b959f382770da5c880239d83bec76 Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Thu, 5 Oct 2023 11:55:16 -0500 Subject: [PATCH 06/29] format --- unstructured/cleaners/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index f23d930e27..f589674c2f 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -464,4 +464,3 @@ def index_adjustment_after_clean_extra_whitespace(index, moved_indices) -> int: def bag_of_words(text: str) -> dict: words = remove_punctuation(text.lower()).split() return dict(Counter(words)) - \ No newline at end of file From 332c70adf0dd82cdfce7293214b087f05879c99b Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Thu, 5 Oct 2023 11:59:41 -0500 Subject: [PATCH 07/29] remove unwanted file --- unstructured/embed/huggingface.py | 70 ------------------------------- 1 file changed, 70 deletions(-) delete mode 100644 unstructured/embed/huggingface.py diff --git a/unstructured/embed/huggingface.py b/unstructured/embed/huggingface.py deleted file mode 100644 index 45df7ba1df..0000000000 --- a/unstructured/embed/huggingface.py +++ /dev/null @@ -1,70 +0,0 @@ -import types -from typing import List, Optional - -import numpy as np - -from langchain.embeddings import HuggingFaceEmbeddings - -from unstructured.documents.elements import ( - Element, -) -from unstructured.embed.interfaces import BaseEmbeddingEncoder -from unstructured.ingest.error import EmbeddingEncoderConnectionError -from unstructured.utils import requires_dependencies - - -class 
HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder): - def __init__( - self, - model_name: Optional[str] = "sentence-transformers/all-MiniLM-L6-v2", - model_kwargs: Optional[dict] = {"device": "cpu"}, - encode_kwargs: Optional[dict] = {"normalize_embeddings": False}, - ): - self.model_name = model_name - self.model_kwargs = model_kwargs - self.encode_kwargs = encode_kwargs - - self.initialize() - - def initialize(self): - """Creates a langchain HuggingFace object to embed elements.""" - - self.hf = HuggingFaceEmbeddings( - model_name=self.model_name, - model_kwargs=self.model_kwargs, - encode_kwargs=self.encode_kwargs, - ) - - self.examplary_embedding = self.hf.embed_query("Q") - - return self.hf - - def num_of_dimensions(self): - return np.shape(self.examplary_embedding) - - def is_unit_vector(self): - return np.isclose(np.linalg.norm(self.examplary_embedding), 1.0) - - def embed_query(self, query): - return self.hf.embed_documents([str(query)]) - - def embed_documents(self, elements: List[Element]) -> List[Element]: - embeddings = self.hf.embed_documents([str(e) for e in elements]) - elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings) - return elements_with_embeddings - - def _add_embeddings_to_elements(self, elements, embeddings) -> List[Element]: - assert len(elements) == len(embeddings) - elements_w_embedding = [] - - for i, element in enumerate(elements): - original_method = element.to_dict - - def new_to_dict(self): - d = original_method() - d["embeddings"] = self.embeddings - return d - - element.embeddings = embeddings[i] - elements_w_embedding.append(element) - element.to_dict = types.MethodType(new_to_dict, element) From 81ba8759eb00ad7846014ec51d471ab5848660b6 Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Thu, 5 Oct 2023 14:29:41 -0500 Subject: [PATCH 08/29] update changelog and version --- CHANGELOG.md | 2 +- unstructured/__version__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md 
b/CHANGELOG.md index 7d5d1843b2..abc546c48a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.20-dev2 +## 0.10.20-dev3 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 680eaf3a9a..57943dc290 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.20-dev2" # pragma: no cover +__version__ = "0.10.20-dev3" # pragma: no cover From c4114f7152ecb7c556a9b133e728dc21ab36a84c Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Thu, 5 Oct 2023 14:54:42 -0500 Subject: [PATCH 09/29] fix test --- test_unstructured/cleaners/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py index 579fc0d2f2..d78bc1f01c 100644 --- a/test_unstructured/cleaners/test_core.py +++ b/test_unstructured/cleaners/test_core.py @@ -307,7 +307,7 @@ def test_bytes_string_to_string(): [ ( "The dog loved the cat, but the cat loved the cow", - {"the": 4, "cat": 1, "loved": 2, "dog": 2, "but": 1, "cow": 1}, + {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1}, ), ], ) From 71b5656eaa810172ff088bf4a47d676bebf4715e Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Thu, 5 Oct 2023 15:41:24 -0500 Subject: [PATCH 10/29] added test --- test_unstructured/cleaners/test_core.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py index d78bc1f01c..bee1dfd350 100644 --- a/test_unstructured/cleaners/test_core.py +++ b/test_unstructured/cleaners/test_core.py @@ -309,6 +309,10 @@ def test_bytes_string_to_string(): "The dog loved the cat, but the cat loved the cow", {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1}, ), + ( + "i n t r o d u c t i o n", + {"i": 2, "n": 2, "t": 2, "o": 2, "r": 1, "d": 1, "u": 1, "c": 1}, + ), ], ) def test_bag_of_words(text, expected): From 
2e041198e70acbac4764b09d801a1eb05c04c6ce Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Thu, 5 Oct 2023 17:29:53 -0500 Subject: [PATCH 11/29] redo logic for bag of words --- unstructured/cleaners/core.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index f589674c2f..4b44baab07 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -2,7 +2,6 @@ import re import sys import unicodedata -from collections import Counter from typing import Tuple import numpy as np @@ -462,5 +461,27 @@ def index_adjustment_after_clean_extra_whitespace(index, moved_indices) -> int: def bag_of_words(text: str) -> dict: + incorrect_word = "" + bow = {} + words = remove_punctuation(text.lower()).split() - return dict(Counter(words)) + i = 0 + while i < len(words): + if len(words[i]) > 1: + if words[i] in bow.keys(): + bow[words[i]] += 1 + else: + bow[words[i]] = 1 + i += 1 + else: + j = i + while j < len(words) and len(words[j]) == 1: + incorrect_word += words[j] + j += 1 + if len(incorrect_word) == 1: + bow[incorrect_word] = 1 + else: + incorrect_word = " ".join(list(incorrect_word)) + bow[incorrect_word] = 1 + i = j + return bow From 5d1769a4e2a67a12bd5342a6388683f62ad40a33 Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Thu, 5 Oct 2023 17:34:09 -0500 Subject: [PATCH 12/29] update tests --- test_unstructured/cleaners/test_core.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py index bee1dfd350..c04966d047 100644 --- a/test_unstructured/cleaners/test_core.py +++ b/test_unstructured/cleaners/test_core.py @@ -313,6 +313,10 @@ def test_bytes_string_to_string(): "i n t r o d u c t i o n", {"i": 2, "n": 2, "t": 2, "o": 2, "r": 1, "d": 1, "u": 1, "c": 1}, ), + ( + "Hello my name is H a r p e r, what's your name?", + {"hello": 1, "my": 1, "name": 2, "is": 1, "h a r p 
e r": 1, "whats": 1, "your": 1}, + ), ], ) def test_bag_of_words(text, expected): From f8ecffad5009f9221f2df51021d5649c3d613955 Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Fri, 6 Oct 2023 09:19:44 -0500 Subject: [PATCH 13/29] remove funky words --- test_unstructured/cleaners/test_core.py | 6 +----- unstructured/cleaners/core.py | 11 +---------- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py index c04966d047..85c345db10 100644 --- a/test_unstructured/cleaners/test_core.py +++ b/test_unstructured/cleaners/test_core.py @@ -309,13 +309,9 @@ def test_bytes_string_to_string(): "The dog loved the cat, but the cat loved the cow", {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1}, ), - ( - "i n t r o d u c t i o n", - {"i": 2, "n": 2, "t": 2, "o": 2, "r": 1, "d": 1, "u": 1, "c": 1}, - ), ( "Hello my name is H a r p e r, what's your name?", - {"hello": 1, "my": 1, "name": 2, "is": 1, "h a r p e r": 1, "whats": 1, "your": 1}, + {"hello": 1, "my": 1, "name": 2, "is": 1, "whats": 1, "your": 1}, ), ], ) diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index 4b44baab07..17a20d3589 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -474,14 +474,5 @@ def bag_of_words(text: str) -> dict: bow[words[i]] = 1 i += 1 else: - j = i - while j < len(words) and len(words[j]) == 1: - incorrect_word += words[j] - j += 1 - if len(incorrect_word) == 1: - bow[incorrect_word] = 1 - else: - incorrect_word = " ".join(list(incorrect_word)) - bow[incorrect_word] = 1 - i = j + i += 1 return bow From 010477a47b01a03eaf5b21f98f17cdabc6207697 Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Fri, 6 Oct 2023 10:19:43 -0500 Subject: [PATCH 14/29] update version --- CHANGELOG.md | 4 +++- unstructured/__version__.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 
fe638b4aff..744defb7fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.20-dev3 +## 0.10.20-dev5 ### Enhancements @@ -9,6 +9,8 @@ ### Features +* **Adds `bag_of_words` function** In order to count the word frequency to evaluate extraction accuracy. + ### Fixes * **Fixes category_depth None value for Title elements** Problem: `Title` elements from `chipper` get `category_depth`= None even when `Headline` and/or `Subheadline` elements are present in the same page. Fix: all `Title` elements with `category_depth` = None should be set to have a depth of 0 instead iff there are `Headline` and/or `Subheadline` element-types present. Importance: `Title` elements should be equivalent html `H1` when nested headings are present; otherwise, `category_depth` metadata can result ambiguous within elements in a page. diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 57943dc290..adcfc625cb 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.20-dev3" # pragma: no cover +__version__ = "0.10.20-dev5" # pragma: no cover From b36a310df864a8d770bdd6fec0939a5f8d6ba312 Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Mon, 9 Oct 2023 15:15:36 -0500 Subject: [PATCH 15/29] fix bag of words and move code to correct files --- test_unstructured/cleaners/test_core.py | 17 ------- .../metrics/test_text_extraction.py | 21 +++++++++ unstructured/cleaners/core.py | 22 ++------- unstructured/metrics/text_extraction.py | 47 ++++++++++++++++++- 4 files changed, 70 insertions(+), 37 deletions(-) diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py index 85c345db10..eec8edd2b9 100644 --- a/test_unstructured/cleaners/test_core.py +++ b/test_unstructured/cleaners/test_core.py @@ -300,20 +300,3 @@ def test_clean(text, extra_whitespace, dashes, bullets, lowercase, trailing_punc def test_bytes_string_to_string(): text = 
"\xe6\xaf\x8f\xe6\x97\xa5\xe6\x96\xb0\xe9\x97\xbb" assert core.bytes_string_to_string(text, "utf-8") == "每日新闻" - - -@pytest.mark.parametrize( - ("text", "expected"), - [ - ( - "The dog loved the cat, but the cat loved the cow", - {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1}, - ), - ( - "Hello my name is H a r p e r, what's your name?", - {"hello": 1, "my": 1, "name": 2, "is": 1, "whats": 1, "your": 1}, - ), - ], -) -def test_bag_of_words(text, expected): - assert core.bag_of_words(text) == expected diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py index 73bce5bd6f..691a607845 100644 --- a/test_unstructured/metrics/test_text_extraction.py +++ b/test_unstructured/metrics/test_text_extraction.py @@ -67,3 +67,24 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte assert distance >= 0 assert round(score, 2) == expected_score assert distance == expected_distance + + +@pytest.mark.parametrize( + ("text", "expected"), + [ + ( + "The dog loved the cat, but the cat loved the cow", + {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1}, + ), + ( + "Hello my name is H a r p e r, what's your name?", + {"hello": 1, "my": 1, "name": 2, "is": 1, "what's": 1, "your": 1}, + ), + ( + "I have a dog and a cat, I love my dog.", + {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1}, + ), + ], +) +def test_bag_of_words(text, expected): + assert core.bag_of_words(text) == expected diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index 17a20d3589..00dbfcc6a6 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -296,7 +296,9 @@ def replace_unicode_quotes(text) -> str: ) -def remove_punctuation(s: str) -> str: +def remove_punctuation( + s: str, +) -> str: """Removes punctuation from a given string.""" s = s.translate(tbl) return s @@ -458,21 +460,3 @@ def 
clean_extra_whitespace_with_index_run(text: str) -> Tuple[str, np.ndarray]: def index_adjustment_after_clean_extra_whitespace(index, moved_indices) -> int: return int(index - moved_indices[index]) - - -def bag_of_words(text: str) -> dict: - incorrect_word = "" - bow = {} - - words = remove_punctuation(text.lower()).split() - i = 0 - while i < len(words): - if len(words[i]) > 1: - if words[i] in bow.keys(): - bow[words[i]] += 1 - else: - bow[words[i]] = 1 - i += 1 - else: - i += 1 - return bow diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py index 001bf1bd11..0fac8c23d6 100644 --- a/unstructured/metrics/text_extraction.py +++ b/unstructured/metrics/text_extraction.py @@ -1,4 +1,6 @@ -from typing import Tuple +import sys +import unicodedata +from typing import Tuple, Optional from rapidfuzz.distance import Levenshtein @@ -50,3 +52,46 @@ def calculate_edit_distance( elif return_as == "distance": return distance return 0.0 + + +# Duplicate code from cleaners.core, not sure we want this functionality introduced in the main library. 
+def remove_punctuation(s: str, exclude_punctuation: Optional[list]) -> str: + """Removes punctuation from a given string.""" + + tbl = dict.fromkeys( + i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P") + ) + + if exclude_punctuation: + for punct in exclude_punctuation: + del tbl[ord(punct)] + s = s.translate(tbl) + return s + + +def bag_of_words(text: str) -> dict: + bow = {} + words = remove_punctuation(text.lower(), ["-", "'"]).split() + + i = 0 + while i < len(words): + if len(words[i]) > 1: + if words[i] in bow.keys(): + bow[words[i]] += 1 + else: + bow[words[i]] = 1 + i += 1 + else: + j = i + incorrect_word = "" + while j < len(words) and len(words[j]) == 1: + incorrect_word += words[j] + j += 1 + + if len(incorrect_word) == 1: + if incorrect_word in bow.keys(): + bow[incorrect_word] += 1 + else: + bow[incorrect_word] = 1 + i = j + return bow From 7e0605457ed1d64704ddc80993b8a1ce76412a5c Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Mon, 9 Oct 2023 15:49:36 -0500 Subject: [PATCH 16/29] formatting --- unstructured/metrics/text_extraction.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py index 0fac8c23d6..36e21b9322 100644 --- a/unstructured/metrics/text_extraction.py +++ b/unstructured/metrics/text_extraction.py @@ -1,6 +1,6 @@ import sys import unicodedata -from typing import Tuple, Optional +from typing import Optional, Tuple from rapidfuzz.distance import Levenshtein @@ -54,7 +54,8 @@ def calculate_edit_distance( return 0.0 -# Duplicate code from cleaners.core, not sure we want this functionality introduced in the main library. +# Duplicate code from cleaners.core, +# not sure we want this functionality introduced in the main library. 
def remove_punctuation(s: str, exclude_punctuation: Optional[list]) -> str: """Removes punctuation from a given string.""" @@ -76,7 +77,7 @@ def bag_of_words(text: str) -> dict: i = 0 while i < len(words): if len(words[i]) > 1: - if words[i] in bow.keys(): + if words[i] in bow: bow[words[i]] += 1 else: bow[words[i]] = 1 @@ -89,7 +90,7 @@ def bag_of_words(text: str) -> dict: j += 1 if len(incorrect_word) == 1: - if incorrect_word in bow.keys(): + if incorrect_word in bow: bow[incorrect_word] += 1 else: bow[incorrect_word] = 1 From c5128fc2fc529d5f01edf17e24ac26567aa1e767 Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Mon, 9 Oct 2023 16:29:37 -0500 Subject: [PATCH 17/29] fix typing --- unstructured/metrics/text_extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py index 36e21b9322..571b6ed87d 100644 --- a/unstructured/metrics/text_extraction.py +++ b/unstructured/metrics/text_extraction.py @@ -70,7 +70,7 @@ def remove_punctuation(s: str, exclude_punctuation: Optional[list]) -> str: return s -def bag_of_words(text: str) -> dict: +def bag_of_words(text: str) -> dict[str, int]: bow = {} words = remove_punctuation(text.lower(), ["-", "'"]).split() From f1d32cbfbaf5596bee2b47bc3084bb911d3e011b Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Mon, 9 Oct 2023 16:36:20 -0500 Subject: [PATCH 18/29] restore core.py file --- unstructured/cleaners/core.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index 00dbfcc6a6..ba7ec592db 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -296,9 +296,7 @@ def replace_unicode_quotes(text) -> str: ) -def remove_punctuation( - s: str, -) -> str: +def remove_punctuation(s: str) -> str: """Removes punctuation from a given string.""" s = s.translate(tbl) return s From fbd1abb4d94d3fc81dcace2f13ca5a4550d97227 Mon 
Sep 17 00:00:00 2001 From: Mallori Harrell Date: Mon, 9 Oct 2023 17:54:20 -0500 Subject: [PATCH 19/29] correct typing --- unstructured/metrics/text_extraction.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py index 571b6ed87d..880fce790d 100644 --- a/unstructured/metrics/text_extraction.py +++ b/unstructured/metrics/text_extraction.py @@ -1,6 +1,6 @@ import sys import unicodedata -from typing import Optional, Tuple +from typing import Dict, Optional, Tuple from rapidfuzz.distance import Levenshtein @@ -70,8 +70,9 @@ def remove_punctuation(s: str, exclude_punctuation: Optional[list]) -> str: return s -def bag_of_words(text: str) -> dict[str, int]: - bow = {} +def bag_of_words(text: str) -> Dict[str, int]: + bow = Dict[str, int] + incorrect_word: str words = remove_punctuation(text.lower(), ["-", "'"]).split() i = 0 @@ -85,6 +86,7 @@ def bag_of_words(text: str) -> dict[str, int]: else: j = i incorrect_word = "" + while j < len(words) and len(words[j]) == 1: incorrect_word += words[j] j += 1 From 58a670a38a5df5791db99be3af51647f7c6c9358 Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Mon, 9 Oct 2023 18:07:45 -0500 Subject: [PATCH 20/29] fix syntax --- .../metrics/test_text_extraction.py | 63 +++++++++++++++---- unstructured/metrics/text_extraction.py | 4 +- 2 files changed, 53 insertions(+), 14 deletions(-) diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py index 691a607845..56a53d3617 100644 --- a/test_unstructured/metrics/test_text_extraction.py +++ b/test_unstructured/metrics/test_text_extraction.py @@ -2,7 +2,7 @@ import pytest -from unstructured.metrics.text_extraction import calculate_edit_distance +from unstructured.metrics import text_extraction from unstructured.partition.auto import partition @@ -16,32 +16,71 @@ def test_calculate_edit_distance(): source_cct_addn_char = "I like 
pizza. I like beagles." source_cct_dup_word = "I like pizza pizza. I like bagels." - assert round(calculate_edit_distance(source_cct, source_cct, return_as="score"), 2) == 1.0 assert ( - round(calculate_edit_distance(source_cct_word_space, source_cct, return_as="score"), 2) + round(text_extraction.calculate_edit_distance(source_cct, source_cct, return_as="score"), 2) + == 1.0 + ) + assert ( + round( + text_extraction.calculate_edit_distance( + source_cct_word_space, source_cct, return_as="score" + ), + 2, + ) == 0.75 ) assert ( - round(calculate_edit_distance(source_cct_spaces, source_cct, return_as="score"), 2) == 0.39 + round( + text_extraction.calculate_edit_distance( + source_cct_spaces, source_cct, return_as="score" + ), + 2, + ) + == 0.39 ) assert ( - round(calculate_edit_distance(source_cct_no_space, source_cct, return_as="score"), 2) + round( + text_extraction.calculate_edit_distance( + source_cct_no_space, source_cct, return_as="score" + ), + 2, + ) == 0.64 ) assert ( - round(calculate_edit_distance(source_cct_one_sentence, source_cct, return_as="score"), 2) + round( + text_extraction.calculate_edit_distance( + source_cct_one_sentence, source_cct, return_as="score" + ), + 2, + ) == 0.0 ) assert ( - round(calculate_edit_distance(source_cct_missing_word, source_cct, return_as="score"), 2) + round( + text_extraction.calculate_edit_distance( + source_cct_missing_word, source_cct, return_as="score" + ), + 2, + ) == 0.57 ) assert ( - round(calculate_edit_distance(source_cct_addn_char, source_cct, return_as="score"), 2) + round( + text_extraction.calculate_edit_distance( + source_cct_addn_char, source_cct, return_as="score" + ), + 2, + ) == 0.89 ) assert ( - round(calculate_edit_distance(source_cct_dup_word, source_cct, return_as="score"), 2) + round( + text_extraction.calculate_edit_distance( + source_cct_dup_word, source_cct, return_as="score" + ), + 2, + ) == 0.79 ) @@ -59,8 +98,8 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, 
expecte elements = partition(filename=f"example-docs/{filename}") output_cct = "\n".join([str(el) for el in elements]) - score = calculate_edit_distance(output_cct, source_cct, return_as="score") - distance = calculate_edit_distance(output_cct, source_cct, return_as="distance") + score = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="score") + distance = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="distance") assert score >= 0 assert score <= 1.0 @@ -87,4 +126,4 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte ], ) def test_bag_of_words(text, expected): - assert core.bag_of_words(text) == expected + assert text_extraction.bag_of_words(text) == expected diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py index 880fce790d..79d5d1e0a1 100644 --- a/unstructured/metrics/text_extraction.py +++ b/unstructured/metrics/text_extraction.py @@ -71,8 +71,8 @@ def remove_punctuation(s: str, exclude_punctuation: Optional[list]) -> str: def bag_of_words(text: str) -> Dict[str, int]: - bow = Dict[str, int] - incorrect_word: str + bow: Dict[str, int] = {} + incorrect_word: str = "" words = remove_punctuation(text.lower(), ["-", "'"]).split() i = 0 From dcd053f69c6c6c390e0c53588bf237647fa9b9a9 Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Mon, 9 Oct 2023 18:35:01 -0500 Subject: [PATCH 21/29] add new condition --- test_unstructured/metrics/test_text_extraction.py | 15 +++++++++++++++ unstructured/metrics/text_extraction.py | 8 ++++++++ 2 files changed, 23 insertions(+) diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py index 56a53d3617..358238d9ee 100644 --- a/test_unstructured/metrics/test_text_extraction.py +++ b/test_unstructured/metrics/test_text_extraction.py @@ -123,6 +123,21 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte "I have a dog and 
a cat, I love my dog.", {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1}, ), + ( + "My dog's hair is red, but the dogs' houses are blue.", + { + "my": 1, + "dog's": 1, + "hair": 1, + "is": 2, + "red": 1, + "but": 1, + "the": 1, + "dogs": 1, + "house": 1, + "blue": 1, + }, + ), ], ) def test_bag_of_words(text, expected): diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py index 79d5d1e0a1..027a047d44 100644 --- a/unstructured/metrics/text_extraction.py +++ b/unstructured/metrics/text_extraction.py @@ -4,6 +4,8 @@ from rapidfuzz.distance import Levenshtein +from unstructured.nlp.patterns import ENDS_IN_PUNCT_RE + def calculate_edit_distance( output: str, @@ -75,6 +77,12 @@ def bag_of_words(text: str) -> Dict[str, int]: incorrect_word: str = "" words = remove_punctuation(text.lower(), ["-", "'"]).split() + # Remove remaining punctuation + for idx in range(len(words)): + punct = ENDS_IN_PUNCT_RE.findall(words[idx]) + if punct: + words[idx] = words[idx].replace(punct[0], "") + i = 0 while i < len(words): if len(words[i]) > 1: From e86da521a6196c7ea0a5863d66306a4a08cba033 Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Mon, 9 Oct 2023 18:41:00 -0500 Subject: [PATCH 22/29] remove additional code --- test_unstructured/metrics/test_text_extraction.py | 2 +- unstructured/metrics/text_extraction.py | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py index 358238d9ee..735e7af2da 100644 --- a/test_unstructured/metrics/test_text_extraction.py +++ b/test_unstructured/metrics/test_text_extraction.py @@ -133,7 +133,7 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte "red": 1, "but": 1, "the": 1, - "dogs": 1, + "dog's": 1, "house": 1, "blue": 1, }, diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py index 
027a047d44..57fe29be77 100644 --- a/unstructured/metrics/text_extraction.py +++ b/unstructured/metrics/text_extraction.py @@ -1,11 +1,10 @@ import sys import unicodedata + from typing import Dict, Optional, Tuple from rapidfuzz.distance import Levenshtein -from unstructured.nlp.patterns import ENDS_IN_PUNCT_RE - def calculate_edit_distance( output: str, @@ -77,12 +76,6 @@ def bag_of_words(text: str) -> Dict[str, int]: incorrect_word: str = "" words = remove_punctuation(text.lower(), ["-", "'"]).split() - # Remove remaining punctuation - for idx in range(len(words)): - punct = ENDS_IN_PUNCT_RE.findall(words[idx]) - if punct: - words[idx] = words[idx].replace(punct[0], "") - i = 0 while i < len(words): if len(words[i]) > 1: From 88ba596c5c18bd2e0ddcdbfd9a7c73c882fba576 Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Mon, 9 Oct 2023 21:21:50 -0500 Subject: [PATCH 23/29] removes hyphens at the beginning of sentence --- .../metrics/test_text_extraction.py | 2 +- unstructured/cleaners/core.py | 11 ++++++++- unstructured/metrics/text_extraction.py | 24 +++---------------- 3 files changed, 14 insertions(+), 23 deletions(-) diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py index 735e7af2da..225a264791 100644 --- a/test_unstructured/metrics/test_text_extraction.py +++ b/test_unstructured/metrics/test_text_extraction.py @@ -133,7 +133,7 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte "red": 1, "but": 1, "the": 1, - "dog's": 1, + "dogs'": 1, "house": 1, "blue": 1, }, diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index ba7ec592db..e5df8df0ce 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -2,7 +2,7 @@ import re import sys import unicodedata -from typing import Tuple +from typing import Optional, Tuple import numpy as np @@ -302,6 +302,15 @@ def remove_punctuation(s: str) -> str: return s +def 
remove_sentence_punctuation(s: str, exclude_punctuation: Optional[list]) -> str: + tbl_new = tbl.copy() + if exclude_punctuation: + for punct in exclude_punctuation: + del tbl_new[ord(punct)] + s = s.translate(tbl_new) + return s + + def clean_extra_whitespace(text: str) -> str: """Cleans extra whitespace characters that appear between words. diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py index 57fe29be77..aba88c2178 100644 --- a/unstructured/metrics/text_extraction.py +++ b/unstructured/metrics/text_extraction.py @@ -1,9 +1,7 @@ -import sys -import unicodedata - -from typing import Dict, Optional, Tuple +from typing import Dict, Tuple from rapidfuzz.distance import Levenshtein +from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation def calculate_edit_distance( @@ -55,26 +53,10 @@ def calculate_edit_distance( return 0.0 -# Duplicate code from cleaners.core, -# not sure we want this functionality introduced in the main library. 
-def remove_punctuation(s: str, exclude_punctuation: Optional[list]) -> str: - """Removes punctuation from a given string.""" - - tbl = dict.fromkeys( - i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P") - ) - - if exclude_punctuation: - for punct in exclude_punctuation: - del tbl[ord(punct)] - s = s.translate(tbl) - return s - - def bag_of_words(text: str) -> Dict[str, int]: bow: Dict[str, int] = {} incorrect_word: str = "" - words = remove_punctuation(text.lower(), ["-", "'"]).split() + words = clean_bullets(remove_sentence_punctuation(text.lower(), ["-", "'"])).split() i = 0 while i < len(words): From bd4620398c742e81cf394873ee4e5938f12fc13a Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Mon, 9 Oct 2023 21:22:51 -0500 Subject: [PATCH 24/29] formatted --- .../metrics/test_text_extraction.py | 28 ++++++++++++++----- unstructured/metrics/text_extraction.py | 1 + 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py index 225a264791..f830a6ce9b 100644 --- a/test_unstructured/metrics/test_text_extraction.py +++ b/test_unstructured/metrics/test_text_extraction.py @@ -23,7 +23,9 @@ def test_calculate_edit_distance(): assert ( round( text_extraction.calculate_edit_distance( - source_cct_word_space, source_cct, return_as="score" + source_cct_word_space, + source_cct, + return_as="score", ), 2, ) @@ -32,7 +34,9 @@ def test_calculate_edit_distance(): assert ( round( text_extraction.calculate_edit_distance( - source_cct_spaces, source_cct, return_as="score" + source_cct_spaces, + source_cct, + return_as="score", ), 2, ) @@ -41,7 +45,9 @@ def test_calculate_edit_distance(): assert ( round( text_extraction.calculate_edit_distance( - source_cct_no_space, source_cct, return_as="score" + source_cct_no_space, + source_cct, + return_as="score", ), 2, ) @@ -50,7 +56,9 @@ def test_calculate_edit_distance(): assert ( round( 
text_extraction.calculate_edit_distance( - source_cct_one_sentence, source_cct, return_as="score" + source_cct_one_sentence, + source_cct, + return_as="score", ), 2, ) @@ -59,7 +67,9 @@ def test_calculate_edit_distance(): assert ( round( text_extraction.calculate_edit_distance( - source_cct_missing_word, source_cct, return_as="score" + source_cct_missing_word, + source_cct, + return_as="score", ), 2, ) @@ -68,7 +78,9 @@ def test_calculate_edit_distance(): assert ( round( text_extraction.calculate_edit_distance( - source_cct_addn_char, source_cct, return_as="score" + source_cct_addn_char, + source_cct, + return_as="score", ), 2, ) @@ -77,7 +89,9 @@ def test_calculate_edit_distance(): assert ( round( text_extraction.calculate_edit_distance( - source_cct_dup_word, source_cct, return_as="score" + source_cct_dup_word, + source_cct, + return_as="score", ), 2, ) diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py index aba88c2178..24aab5edce 100644 --- a/unstructured/metrics/text_extraction.py +++ b/unstructured/metrics/text_extraction.py @@ -1,6 +1,7 @@ from typing import Dict, Tuple from rapidfuzz.distance import Levenshtein + from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation From 1838b9569f2cac7aff809e7547a3fde81aea04b9 Mon Sep 17 00:00:00 2001 From: Shreya Nidadavolu Date: Tue, 10 Oct 2023 00:58:30 -0700 Subject: [PATCH 25/29] adding test for dash and hyphen --- .../metrics/test_text_extraction.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py index f830a6ce9b..ed6874faac 100644 --- a/test_unstructured/metrics/test_text_extraction.py +++ b/test_unstructured/metrics/test_text_extraction.py @@ -152,6 +152,27 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte "blue": 1, }, ), + ( + "Sometimes sentences have a dash - like this one! 
A hyphen connects 2 words with no gap: easy-peasy.", + { + "sometimes": 1, + "sentences": 1, + "have": 1, + "a": 2, + "dash": 1, + "like": 1, + "this": 1, + "one": 1, + "hyphen": 1, + "connects": 1, + "2": 1, + "words": 1, + "with": 1, + "no": 1, + "gap": 1, + "easy-peasy": 1, + }, + ), ], ) def test_bag_of_words(text, expected): From 128ea22abc58ba734d5cf7d7fe0fc8935281b9e7 Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Tue, 10 Oct 2023 09:16:37 -0500 Subject: [PATCH 26/29] add test --- .../metrics/test_text_extraction.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py index f830a6ce9b..8396763054 100644 --- a/test_unstructured/metrics/test_text_extraction.py +++ b/test_unstructured/metrics/test_text_extraction.py @@ -137,6 +137,22 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte "I have a dog and a cat, I love my dog.", {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1}, ), + ( + "-First I'd like to install the anti-theft lock on my car.", + { + "first": 1, + "i'd": 1, + "like": 1, + "to": 1, + "install": 1, + "the": 1, + "anti-theft": 1, + "lock": 1, + "on": 1, + "my": 1, + "car": 1, + }, + ), ( "My dog's hair is red, but the dogs' houses are blue.", { From 8dd9b06aed4bfeaed4d51e96b2975fbb96adb4bb Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Tue, 10 Oct 2023 09:38:46 -0500 Subject: [PATCH 27/29] removed test --- .../metrics/test_text_extraction.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py index b244e8c81d..2320c8e55e 100644 --- a/test_unstructured/metrics/test_text_extraction.py +++ b/test_unstructured/metrics/test_text_extraction.py @@ -137,22 +137,6 @@ def test_calculate_edit_distance_with_filename(filename, 
expected_score, expecte "I have a dog and a cat, I love my dog.", {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1}, ), - ( - "-First I'd like to install the anti-theft lock on my car.", - { - "first": 1, - "i'd": 1, - "like": 1, - "to": 1, - "install": 1, - "the": 1, - "anti-theft": 1, - "lock": 1, - "on": 1, - "my": 1, - "car": 1, - }, - ), ( "My dog's hair is red, but the dogs' houses are blue.", { @@ -169,7 +153,8 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte }, ), ( - "Sometimes sentences have a dash - like this one! A hyphen connects 2 words with no gap: easy-peasy.", + """Sometimes sentences have a dash - like this one! + A hyphen connects 2 words with no gap: easy-peasy.""", { "sometimes": 1, "sentences": 1, From 999cfc854d0edb699f09caa90ece2962657419aa Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Tue, 10 Oct 2023 11:05:15 -0500 Subject: [PATCH 28/29] fix logic to remove punctuation with spaces around it. --- test_unstructured/metrics/test_text_extraction.py | 3 ++- unstructured/metrics/text_extraction.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py index 2320c8e55e..b25bd3ef35 100644 --- a/test_unstructured/metrics/test_text_extraction.py +++ b/test_unstructured/metrics/test_text_extraction.py @@ -148,7 +148,8 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte "but": 1, "the": 1, "dogs'": 1, - "house": 1, + "houses": 1, + "are": 1, "blue": 1, }, ), diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py index 24aab5edce..39d1c395a9 100644 --- a/unstructured/metrics/text_extraction.py +++ b/unstructured/metrics/text_extraction.py @@ -75,7 +75,7 @@ def bag_of_words(text: str) -> Dict[str, int]: incorrect_word += words[j] j += 1 - if len(incorrect_word) == 1: + if len(incorrect_word) == 1 and 
words[i].isalnum(): if incorrect_word in bow: bow[incorrect_word] += 1 else: From adfec61ea1279f4f75527726259e0928129561c6 Mon Sep 17 00:00:00 2001 From: Mallori Harrell Date: Tue, 10 Oct 2023 11:30:09 -0500 Subject: [PATCH 29/29] fix test --- test_unstructured/metrics/test_text_extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py index b25bd3ef35..5be9753428 100644 --- a/test_unstructured/metrics/test_text_extraction.py +++ b/test_unstructured/metrics/test_text_extraction.py @@ -143,7 +143,7 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte "my": 1, "dog's": 1, "hair": 1, - "is": 2, + "is": 1, "red": 1, "but": 1, "the": 1,