From fe1767d019179b03ac4c2746d2ec924ad02d2c54 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Thu, 28 Sep 2023 10:55:21 -0500
Subject: [PATCH 01/29] local embedding model from huggingface

---
 unstructured/embed/huggingface.py | 72 +++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 unstructured/embed/huggingface.py

diff --git a/unstructured/embed/huggingface.py b/unstructured/embed/huggingface.py
new file mode 100644
index 0000000000..6df0911fd4
--- /dev/null
+++ b/unstructured/embed/huggingface.py
@@ -0,0 +1,72 @@
+import types
+from typing import List, Optional
+
+import numpy as np
+
+from unstructured.documents.elements import (
+    Element,
+)
+from unstructured.embed.interfaces import BaseEmbeddingEncoder
+from unstructured.ingest.error import EmbeddingEncoderConnectionError
+from unstructured.utils import requires_dependencies
+
+
+class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
+    def __init__(
+        self,
+        model_name: Optional[str] = "sentence-transformers/all-mpnet-base-v2",
+        model_kwargs: Optional[dict] = {"device": "cpu"},
+        encode_kwargs: Optional[dict] = {"normalize_embeddings": False},
+    ):
+        self.model_name = model_name
+        self.model_kwargs = model_kwargs
+        self.encode_kwargs = encode_kwargs
+        self.initialize()
+
+    @EmbeddingEncoderConnectionError.wrap
+    @requires_dependencies(
+        ["langchain", "huggingface", "tiktoken"],
+        extras="huggingface",
+    )
+    def initialize(self):
+        """Creates a langchain HuggingFace object to embed elements."""
+        from langchain.embeddings import HuggingFaceEmbeddings
+
+        self.hf = HuggingFaceEmbeddings(
+            model=self.model_name,
+            model_kwargs=self.model_kwargs,
+            encode_kwargs=self.encode_kwargs,
+        )
+        self.examplary_embedding = self.hf.embed_query("Q")
+
+        return self.hf
+
+    def num_of_dimensions(self):
+        return np.shape(self.examplary_embedding)
+
+    def is_unit_vector(self):
+        return np.isclose(np.linalg.norm(self.examplary_embedding), 1.0)
+
+    def embed_query(self, query):
+        return self.hf.embed_documents([str(query)])
+
+    def embed_documents(self, elements: List[Element]) -> List[Element]:
+        embeddings = self.hf.embed_documents([str(e) for e in elements])
+        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
+        return elements_with_embeddings
+
+    def _add_embeddings_to_elements(self, elements, embeddings) -> List[Element]:
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+
+        for i, element in enumerate(elements):
+            original_method = element.to_dict
+
+            def new_to_dict(self):
+                d = original_method()
+                d["embeddings"] = self.embeddings
+                return d
+
+            element.embeddings = embeddings[i]
+            elements_w_embedding.append(element)
+            element.to_dict = types.MethodType(new_to_dict, element)

From 672bc8d0c7def7b103cb5893fa210be581e18327 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Mon, 2 Oct 2023 15:36:31 -0500
Subject: [PATCH 02/29] add arguments

---
 unstructured/embed/huggingface.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/unstructured/embed/huggingface.py b/unstructured/embed/huggingface.py
index 6df0911fd4..45df7ba1df 100644
--- a/unstructured/embed/huggingface.py
+++ b/unstructured/embed/huggingface.py
@@ -3,6 +3,8 @@
 
 import numpy as np
 
+from langchain.embeddings import HuggingFaceEmbeddings
+
 from unstructured.documents.elements import (
     Element,
 )
@@ -14,29 +16,25 @@
 class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
     def __init__(
         self,
-        model_name: Optional[str] = "sentence-transformers/all-mpnet-base-v2",
+        model_name: Optional[str] = "sentence-transformers/all-MiniLM-L6-v2",
         model_kwargs: Optional[dict] = {"device": "cpu"},
         encode_kwargs: Optional[dict] = {"normalize_embeddings": False},
     ):
         self.model_name = model_name
         self.model_kwargs = model_kwargs
         self.encode_kwargs = encode_kwargs
+
         self.initialize()
 
-    @EmbeddingEncoderConnectionError.wrap
-    @requires_dependencies(
-        ["langchain", "huggingface", "tiktoken"],
-        extras="huggingface",
-    )
     def initialize(self):
         """Creates a langchain HuggingFace object to embed elements."""
-        from langchain.embeddings import HuggingFaceEmbeddings
 
         self.hf = HuggingFaceEmbeddings(
-            model=self.model_name,
+            model_name=self.model_name,
             model_kwargs=self.model_kwargs,
             encode_kwargs=self.encode_kwargs,
         )
+
         self.examplary_embedding = self.hf.embed_query("Q")
 
         return self.hf

From a6f9fbbedffb974cb2aeb28743ab4b498c10734d Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Tue, 3 Oct 2023 12:01:08 -0500
Subject: [PATCH 03/29] begin coding bag of words

---
 unstructured/cleaners/core.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
index ba7ec592db..1ddc1a3200 100644
--- a/unstructured/cleaners/core.py
+++ b/unstructured/cleaners/core.py
@@ -6,6 +6,8 @@
 
 import numpy as np
 
+import nltkfrom nltk.corpus import stopwords
+
 from unstructured.file_utils.encoding import (
     format_encoding_str,
 )
@@ -458,3 +460,15 @@ def clean_extra_whitespace_with_index_run(text: str) -> Tuple[str, np.ndarray]:
 
 def index_adjustment_after_clean_extra_whitespace(index, moved_indices) -> int:
     return int(index - moved_indices[index])
+
+
+def bag_of_words(text: str, remove_stop_words: bool) -> List[Text]:
+    words = set(remove_punctuation(text.lower()).split())
+
+    clean_words = []
+    if remove_stop_words:
+        for word in words:
+            if word not in stopwords.words('english'):
+                clean_words.append(word)
+
+    
\ No newline at end of file

From 8511de1ec9eb7a96b311d5aba3d325a720565bd6 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Thu, 5 Oct 2023 09:46:54 -0500
Subject: [PATCH 04/29] bag of words function

---
 unstructured/cleaners/core.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
index 1ddc1a3200..39fe0b5684 100644
--- a/unstructured/cleaners/core.py
+++ b/unstructured/cleaners/core.py
@@ -2,12 +2,11 @@
 import re
 import sys
 import unicodedata
+from collections import Counter
 from typing import Tuple
 
 import numpy as np
 
-import nltkfrom nltk.corpus import stopwords
-
 from unstructured.file_utils.encoding import (
     format_encoding_str,
 )
@@ -462,13 +461,7 @@ def index_adjustment_after_clean_extra_whitespace(index, moved_indices) -> int:
     return int(index - moved_indices[index])
 
 
-def bag_of_words(text: str, remove_stop_words: bool) -> List[Text]:
-    words = set(remove_punctuation(text.lower()).split())
-
-    clean_words = []
-    if remove_stop_words:
-        for word in words:
-            if word not in stopwords.words('english'):
-                clean_words.append(word)
-
+def bag_of_words(text: str) -> dict[str]:
+    words = remove_punctuation(text.lower()).split()
+    return dict(Counter(words))
     
\ No newline at end of file

From 2722e09d0ee27b102e6cb2155d196e5f93bffa33 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Thu, 5 Oct 2023 11:38:08 -0500
Subject: [PATCH 05/29] fix syntax

---
 test_unstructured/cleaners/test_core.py | 13 +++++++++++++
 unstructured/cleaners/core.py           |  2 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py
index eec8edd2b9..579fc0d2f2 100644
--- a/test_unstructured/cleaners/test_core.py
+++ b/test_unstructured/cleaners/test_core.py
@@ -300,3 +300,16 @@ def test_clean(text, extra_whitespace, dashes, bullets, lowercase, trailing_punc
 def test_bytes_string_to_string():
     text = "\xe6\xaf\x8f\xe6\x97\xa5\xe6\x96\xb0\xe9\x97\xbb"
     assert core.bytes_string_to_string(text, "utf-8") == "每日新闻"
+
+
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        (
+            "The dog loved the cat, but the cat loved the cow",
+            {"the": 4, "cat": 1, "loved": 2, "dog": 2, "but": 1, "cow": 1},
+        ),
+    ],
+)
+def test_bag_of_words(text, expected):
+    assert core.bag_of_words(text) == expected
diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
index 39fe0b5684..f23d930e27 100644
--- a/unstructured/cleaners/core.py
+++ b/unstructured/cleaners/core.py
@@ -461,7 +461,7 @@ def index_adjustment_after_clean_extra_whitespace(index, moved_indices) -> int:
     return int(index - moved_indices[index])
 
 
-def bag_of_words(text: str) -> dict[str]:
+def bag_of_words(text: str) -> dict:
     words = remove_punctuation(text.lower()).split()
     return dict(Counter(words))
     
\ No newline at end of file

From ed42bc18911b959f382770da5c880239d83bec76 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Thu, 5 Oct 2023 11:55:16 -0500
Subject: [PATCH 06/29] format

---
 unstructured/cleaners/core.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
index f23d930e27..f589674c2f 100644
--- a/unstructured/cleaners/core.py
+++ b/unstructured/cleaners/core.py
@@ -464,4 +464,3 @@ def index_adjustment_after_clean_extra_whitespace(index, moved_indices) -> int:
 def bag_of_words(text: str) -> dict:
     words = remove_punctuation(text.lower()).split()
     return dict(Counter(words))
-    
\ No newline at end of file

From 332c70adf0dd82cdfce7293214b087f05879c99b Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Thu, 5 Oct 2023 11:59:41 -0500
Subject: [PATCH 07/29] remove unwanted file

---
 unstructured/embed/huggingface.py | 70 -------------------------------
 1 file changed, 70 deletions(-)
 delete mode 100644 unstructured/embed/huggingface.py

diff --git a/unstructured/embed/huggingface.py b/unstructured/embed/huggingface.py
deleted file mode 100644
index 45df7ba1df..0000000000
--- a/unstructured/embed/huggingface.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import types
-from typing import List, Optional
-
-import numpy as np
-
-from langchain.embeddings import HuggingFaceEmbeddings
-
-from unstructured.documents.elements import (
-    Element,
-)
-from unstructured.embed.interfaces import BaseEmbeddingEncoder
-from unstructured.ingest.error import EmbeddingEncoderConnectionError
-from unstructured.utils import requires_dependencies
-
-
-class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
-    def __init__(
-        self,
-        model_name: Optional[str] = "sentence-transformers/all-MiniLM-L6-v2",
-        model_kwargs: Optional[dict] = {"device": "cpu"},
-        encode_kwargs: Optional[dict] = {"normalize_embeddings": False},
-    ):
-        self.model_name = model_name
-        self.model_kwargs = model_kwargs
-        self.encode_kwargs = encode_kwargs
-
-        self.initialize()
-
-    def initialize(self):
-        """Creates a langchain HuggingFace object to embed elements."""
-
-        self.hf = HuggingFaceEmbeddings(
-            model_name=self.model_name,
-            model_kwargs=self.model_kwargs,
-            encode_kwargs=self.encode_kwargs,
-        )
-
-        self.examplary_embedding = self.hf.embed_query("Q")
-
-        return self.hf
-
-    def num_of_dimensions(self):
-        return np.shape(self.examplary_embedding)
-
-    def is_unit_vector(self):
-        return np.isclose(np.linalg.norm(self.examplary_embedding), 1.0)
-
-    def embed_query(self, query):
-        return self.hf.embed_documents([str(query)])
-
-    def embed_documents(self, elements: List[Element]) -> List[Element]:
-        embeddings = self.hf.embed_documents([str(e) for e in elements])
-        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
-        return elements_with_embeddings
-
-    def _add_embeddings_to_elements(self, elements, embeddings) -> List[Element]:
-        assert len(elements) == len(embeddings)
-        elements_w_embedding = []
-
-        for i, element in enumerate(elements):
-            original_method = element.to_dict
-
-            def new_to_dict(self):
-                d = original_method()
-                d["embeddings"] = self.embeddings
-                return d
-
-            element.embeddings = embeddings[i]
-            elements_w_embedding.append(element)
-            element.to_dict = types.MethodType(new_to_dict, element)

From 81ba8759eb00ad7846014ec51d471ab5848660b6 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Thu, 5 Oct 2023 14:29:41 -0500
Subject: [PATCH 08/29] update changelog and version

---
 CHANGELOG.md                | 2 +-
 unstructured/__version__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7d5d1843b2..abc546c48a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.10.20-dev2
+## 0.10.20-dev3
 
 ### Enhancements
 
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 680eaf3a9a..57943dc290 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.10.20-dev2"  # pragma: no cover
+__version__ = "0.10.20-dev3"  # pragma: no cover

From c4114f7152ecb7c556a9b133e728dc21ab36a84c Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Thu, 5 Oct 2023 14:54:42 -0500
Subject: [PATCH 09/29] fix test

---
 test_unstructured/cleaners/test_core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py
index 579fc0d2f2..d78bc1f01c 100644
--- a/test_unstructured/cleaners/test_core.py
+++ b/test_unstructured/cleaners/test_core.py
@@ -307,7 +307,7 @@ def test_bytes_string_to_string():
     [
         (
             "The dog loved the cat, but the cat loved the cow",
-            {"the": 4, "cat": 1, "loved": 2, "dog": 2, "but": 1, "cow": 1},
+            {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1},
         ),
     ],
 )

From 71b5656eaa810172ff088bf4a47d676bebf4715e Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Thu, 5 Oct 2023 15:41:24 -0500
Subject: [PATCH 10/29] added test

---
 test_unstructured/cleaners/test_core.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py
index d78bc1f01c..bee1dfd350 100644
--- a/test_unstructured/cleaners/test_core.py
+++ b/test_unstructured/cleaners/test_core.py
@@ -309,6 +309,10 @@ def test_bytes_string_to_string():
             "The dog loved the cat, but the cat loved the cow",
             {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1},
         ),
+        (
+            "i n t r o d u c t i o n",
+            {"i": 2, "n": 2, "t": 2, "o": 2, "r": 1, "d": 1, "u": 1, "c": 1},
+        ),
     ],
 )
 def test_bag_of_words(text, expected):

From 2e041198e70acbac4764b09d801a1eb05c04c6ce Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Thu, 5 Oct 2023 17:29:53 -0500
Subject: [PATCH 11/29] redo logic for bag of words

---
 unstructured/cleaners/core.py | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
index f589674c2f..4b44baab07 100644
--- a/unstructured/cleaners/core.py
+++ b/unstructured/cleaners/core.py
@@ -2,7 +2,6 @@
 import re
 import sys
 import unicodedata
-from collections import Counter
 from typing import Tuple
 
 import numpy as np
@@ -462,5 +461,27 @@ def index_adjustment_after_clean_extra_whitespace(index, moved_indices) -> int:
 
 
 def bag_of_words(text: str) -> dict:
+    incorrect_word = ""
+    bow = {}
+
     words = remove_punctuation(text.lower()).split()
-    return dict(Counter(words))
+    i = 0
+    while i < len(words):
+        if len(words[i]) > 1:
+            if words[i] in bow.keys():
+                bow[words[i]] += 1
+            else:
+                bow[words[i]] = 1
+            i += 1
+        else:
+            j = i
+            while j < len(words) and len(words[j]) == 1:
+                incorrect_word += words[j]
+                j += 1
+            if len(incorrect_word) == 1:
+                bow[incorrect_word] = 1
+            else:
+                incorrect_word = " ".join(list(incorrect_word))
+                bow[incorrect_word] = 1
+            i = j
+    return bow

From 5d1769a4e2a67a12bd5342a6388683f62ad40a33 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Thu, 5 Oct 2023 17:34:09 -0500
Subject: [PATCH 12/29] update tests

---
 test_unstructured/cleaners/test_core.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py
index bee1dfd350..c04966d047 100644
--- a/test_unstructured/cleaners/test_core.py
+++ b/test_unstructured/cleaners/test_core.py
@@ -313,6 +313,10 @@ def test_bytes_string_to_string():
             "i n t r o d u c t i o n",
             {"i": 2, "n": 2, "t": 2, "o": 2, "r": 1, "d": 1, "u": 1, "c": 1},
         ),
+        (
+            "Hello my name is H a r p e r, what's your name?",
+            {"hello": 1, "my": 1, "name": 2, "is": 1, "h a r p e r": 1, "whats": 1, "your": 1},
+        ),
     ],
 )
 def test_bag_of_words(text, expected):

From f8ecffad5009f9221f2df51021d5649c3d613955 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Fri, 6 Oct 2023 09:19:44 -0500
Subject: [PATCH 13/29] remove funky words

---
 test_unstructured/cleaners/test_core.py |  6 +-----
 unstructured/cleaners/core.py           | 11 +----------
 2 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py
index c04966d047..85c345db10 100644
--- a/test_unstructured/cleaners/test_core.py
+++ b/test_unstructured/cleaners/test_core.py
@@ -309,13 +309,9 @@ def test_bytes_string_to_string():
             "The dog loved the cat, but the cat loved the cow",
             {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1},
         ),
-        (
-            "i n t r o d u c t i o n",
-            {"i": 2, "n": 2, "t": 2, "o": 2, "r": 1, "d": 1, "u": 1, "c": 1},
-        ),
         (
             "Hello my name is H a r p e r, what's your name?",
-            {"hello": 1, "my": 1, "name": 2, "is": 1, "h a r p e r": 1, "whats": 1, "your": 1},
+            {"hello": 1, "my": 1, "name": 2, "is": 1, "whats": 1, "your": 1},
         ),
     ],
 )
diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
index 4b44baab07..17a20d3589 100644
--- a/unstructured/cleaners/core.py
+++ b/unstructured/cleaners/core.py
@@ -474,14 +474,5 @@ def bag_of_words(text: str) -> dict:
                 bow[words[i]] = 1
             i += 1
         else:
-            j = i
-            while j < len(words) and len(words[j]) == 1:
-                incorrect_word += words[j]
-                j += 1
-            if len(incorrect_word) == 1:
-                bow[incorrect_word] = 1
-            else:
-                incorrect_word = " ".join(list(incorrect_word))
-                bow[incorrect_word] = 1
-            i = j
+            i += 1
     return bow

From 010477a47b01a03eaf5b21f98f17cdabc6207697 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Fri, 6 Oct 2023 10:19:43 -0500
Subject: [PATCH 14/29] update version

---
 CHANGELOG.md                | 4 +++-
 unstructured/__version__.py | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fe638b4aff..744defb7fb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.10.20-dev3
+## 0.10.20-dev5
 
 ### Enhancements
 
@@ -9,6 +9,8 @@
 
 ### Features
 
+* **Adds `bag_of_words` function** In order to count the word frequency to evaluate extraction accuracy.
+
 ### Fixes
 
 * **Fixes category_depth None value for Title elements** Problem: `Title` elements from `chipper` get `category_depth`= None even when `Headline` and/or `Subheadline` elements are present in the same page. Fix: all `Title` elements with `category_depth` = None should be set to have a depth of 0 instead iff there are `Headline` and/or `Subheadline` element-types present. Importance: `Title` elements should be equivalent html `H1` when nested headings are present; otherwise, `category_depth` metadata can result ambiguous within elements in a page.
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 57943dc290..adcfc625cb 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.10.20-dev3"  # pragma: no cover
+__version__ = "0.10.20-dev5"  # pragma: no cover

From b36a310df864a8d770bdd6fec0939a5f8d6ba312 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Mon, 9 Oct 2023 15:15:36 -0500
Subject: [PATCH 15/29] fix bag of words and move code to correct files

---
 test_unstructured/cleaners/test_core.py       | 17 -------
 .../metrics/test_text_extraction.py           | 21 +++++++++
 unstructured/cleaners/core.py                 | 22 ++-------
 unstructured/metrics/text_extraction.py       | 47 ++++++++++++++++++-
 4 files changed, 70 insertions(+), 37 deletions(-)

diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py
index 85c345db10..eec8edd2b9 100644
--- a/test_unstructured/cleaners/test_core.py
+++ b/test_unstructured/cleaners/test_core.py
@@ -300,20 +300,3 @@ def test_clean(text, extra_whitespace, dashes, bullets, lowercase, trailing_punc
 def test_bytes_string_to_string():
     text = "\xe6\xaf\x8f\xe6\x97\xa5\xe6\x96\xb0\xe9\x97\xbb"
     assert core.bytes_string_to_string(text, "utf-8") == "每日新闻"
-
-
-@pytest.mark.parametrize(
-    ("text", "expected"),
-    [
-        (
-            "The dog loved the cat, but the cat loved the cow",
-            {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1},
-        ),
-        (
-            "Hello my name is H a r p e r, what's your name?",
-            {"hello": 1, "my": 1, "name": 2, "is": 1, "whats": 1, "your": 1},
-        ),
-    ],
-)
-def test_bag_of_words(text, expected):
-    assert core.bag_of_words(text) == expected
diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py
index 73bce5bd6f..691a607845 100644
--- a/test_unstructured/metrics/test_text_extraction.py
+++ b/test_unstructured/metrics/test_text_extraction.py
@@ -67,3 +67,24 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
     assert distance >= 0
     assert round(score, 2) == expected_score
     assert distance == expected_distance
+
+
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        (
+            "The dog loved the cat, but the cat loved the cow",
+            {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1},
+        ),
+        (
+            "Hello my name is H a r p e r, what's your name?",
+            {"hello": 1, "my": 1, "name": 2, "is": 1, "what's": 1, "your": 1},
+        ),
+        (
+            "I have a dog and a cat, I love my dog.",
+            {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1},
+        ),
+    ],
+)
+def test_bag_of_words(text, expected):
+    assert core.bag_of_words(text) == expected
diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
index 17a20d3589..00dbfcc6a6 100644
--- a/unstructured/cleaners/core.py
+++ b/unstructured/cleaners/core.py
@@ -296,7 +296,9 @@ def replace_unicode_quotes(text) -> str:
 )
 
 
-def remove_punctuation(s: str) -> str:
+def remove_punctuation(
+    s: str,
+) -> str:
     """Removes punctuation from a given string."""
     s = s.translate(tbl)
     return s
@@ -458,21 +460,3 @@ def clean_extra_whitespace_with_index_run(text: str) -> Tuple[str, np.ndarray]:
 
 def index_adjustment_after_clean_extra_whitespace(index, moved_indices) -> int:
     return int(index - moved_indices[index])
-
-
-def bag_of_words(text: str) -> dict:
-    incorrect_word = ""
-    bow = {}
-
-    words = remove_punctuation(text.lower()).split()
-    i = 0
-    while i < len(words):
-        if len(words[i]) > 1:
-            if words[i] in bow.keys():
-                bow[words[i]] += 1
-            else:
-                bow[words[i]] = 1
-            i += 1
-        else:
-            i += 1
-    return bow
diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py
index 001bf1bd11..0fac8c23d6 100644
--- a/unstructured/metrics/text_extraction.py
+++ b/unstructured/metrics/text_extraction.py
@@ -1,4 +1,6 @@
-from typing import Tuple
+import sys
+import unicodedata
+from typing import Tuple, Optional
 
 from rapidfuzz.distance import Levenshtein
 
@@ -50,3 +52,46 @@ def calculate_edit_distance(
     elif return_as == "distance":
         return distance
     return 0.0
+
+
+# Duplicate code from cleaners.core, not sure we want this functionality introduced in the main library.
+def remove_punctuation(s: str, exclude_punctuation: Optional[list]) -> str:
+    """Removes punctuation from a given string."""
+
+    tbl = dict.fromkeys(
+        i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")
+    )
+
+    if exclude_punctuation:
+        for punct in exclude_punctuation:
+            del tbl[ord(punct)]
+    s = s.translate(tbl)
+    return s
+
+
+def bag_of_words(text: str) -> dict:
+    bow = {}
+    words = remove_punctuation(text.lower(), ["-", "'"]).split()
+
+    i = 0
+    while i < len(words):
+        if len(words[i]) > 1:
+            if words[i] in bow.keys():
+                bow[words[i]] += 1
+            else:
+                bow[words[i]] = 1
+            i += 1
+        else:
+            j = i
+            incorrect_word = ""
+            while j < len(words) and len(words[j]) == 1:
+                incorrect_word += words[j]
+                j += 1
+
+            if len(incorrect_word) == 1:
+                if incorrect_word in bow.keys():
+                    bow[incorrect_word] += 1
+                else:
+                    bow[incorrect_word] = 1
+            i = j
+    return bow

From 7e0605457ed1d64704ddc80993b8a1ce76412a5c Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Mon, 9 Oct 2023 15:49:36 -0500
Subject: [PATCH 16/29] formatting

---
 unstructured/metrics/text_extraction.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py
index 0fac8c23d6..36e21b9322 100644
--- a/unstructured/metrics/text_extraction.py
+++ b/unstructured/metrics/text_extraction.py
@@ -1,6 +1,6 @@
 import sys
 import unicodedata
-from typing import Tuple, Optional
+from typing import Optional, Tuple
 
 from rapidfuzz.distance import Levenshtein
 
@@ -54,7 +54,8 @@ def calculate_edit_distance(
     return 0.0
 
 
-# Duplicate code from cleaners.core, not sure we want this functionality introduced in the main library.
+# Duplicate code from cleaners.core,
+# not sure we want this functionality introduced in the main library.
 def remove_punctuation(s: str, exclude_punctuation: Optional[list]) -> str:
     """Removes punctuation from a given string."""
 
@@ -76,7 +77,7 @@ def bag_of_words(text: str) -> dict:
     i = 0
     while i < len(words):
         if len(words[i]) > 1:
-            if words[i] in bow.keys():
+            if words[i] in bow:
                 bow[words[i]] += 1
             else:
                 bow[words[i]] = 1
@@ -89,7 +90,7 @@ def bag_of_words(text: str) -> dict:
                 j += 1
 
             if len(incorrect_word) == 1:
-                if incorrect_word in bow.keys():
+                if incorrect_word in bow:
                     bow[incorrect_word] += 1
                 else:
                     bow[incorrect_word] = 1

From c5128fc2fc529d5f01edf17e24ac26567aa1e767 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Mon, 9 Oct 2023 16:29:37 -0500
Subject: [PATCH 17/29] fix typing

---
 unstructured/metrics/text_extraction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py
index 36e21b9322..571b6ed87d 100644
--- a/unstructured/metrics/text_extraction.py
+++ b/unstructured/metrics/text_extraction.py
@@ -70,7 +70,7 @@ def remove_punctuation(s: str, exclude_punctuation: Optional[list]) -> str:
     return s
 
 
-def bag_of_words(text: str) -> dict:
+def bag_of_words(text: str) -> dict[str, int]:
     bow = {}
     words = remove_punctuation(text.lower(), ["-", "'"]).split()
 

From f1d32cbfbaf5596bee2b47bc3084bb911d3e011b Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Mon, 9 Oct 2023 16:36:20 -0500
Subject: [PATCH 18/29] restore core.py file

---
 unstructured/cleaners/core.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
index 00dbfcc6a6..ba7ec592db 100644
--- a/unstructured/cleaners/core.py
+++ b/unstructured/cleaners/core.py
@@ -296,9 +296,7 @@ def replace_unicode_quotes(text) -> str:
 )
 
 
-def remove_punctuation(
-    s: str,
-) -> str:
+def remove_punctuation(s: str) -> str:
     """Removes punctuation from a given string."""
     s = s.translate(tbl)
     return s

From fbd1abb4d94d3fc81dcace2f13ca5a4550d97227 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Mon, 9 Oct 2023 17:54:20 -0500
Subject: [PATCH 19/29] correct typing

---
 unstructured/metrics/text_extraction.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py
index 571b6ed87d..880fce790d 100644
--- a/unstructured/metrics/text_extraction.py
+++ b/unstructured/metrics/text_extraction.py
@@ -1,6 +1,6 @@
 import sys
 import unicodedata
-from typing import Optional, Tuple
+from typing import Dict, Optional, Tuple
 
 from rapidfuzz.distance import Levenshtein
 
@@ -70,8 +70,9 @@ def remove_punctuation(s: str, exclude_punctuation: Optional[list]) -> str:
     return s
 
 
-def bag_of_words(text: str) -> dict[str, int]:
-    bow = {}
+def bag_of_words(text: str) -> Dict[str, int]:
+    bow = Dict[str, int]
+    incorrect_word: str
     words = remove_punctuation(text.lower(), ["-", "'"]).split()
 
     i = 0
@@ -85,6 +86,7 @@ def bag_of_words(text: str) -> dict[str, int]:
         else:
             j = i
             incorrect_word = ""
+
             while j < len(words) and len(words[j]) == 1:
                 incorrect_word += words[j]
                 j += 1

From 58a670a38a5df5791db99be3af51647f7c6c9358 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Mon, 9 Oct 2023 18:07:45 -0500
Subject: [PATCH 20/29] fix syntax

---
 .../metrics/test_text_extraction.py           | 63 +++++++++++++++----
 unstructured/metrics/text_extraction.py       |  4 +-
 2 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py
index 691a607845..56a53d3617 100644
--- a/test_unstructured/metrics/test_text_extraction.py
+++ b/test_unstructured/metrics/test_text_extraction.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from unstructured.metrics.text_extraction import calculate_edit_distance
+from unstructured.metrics import text_extraction
 from unstructured.partition.auto import partition
 
 
@@ -16,32 +16,71 @@ def test_calculate_edit_distance():
     source_cct_addn_char = "I like pizza. I like beagles."
     source_cct_dup_word = "I like pizza pizza. I like bagels."
 
-    assert round(calculate_edit_distance(source_cct, source_cct, return_as="score"), 2) == 1.0
     assert (
-        round(calculate_edit_distance(source_cct_word_space, source_cct, return_as="score"), 2)
+        round(text_extraction.calculate_edit_distance(source_cct, source_cct, return_as="score"), 2)
+        == 1.0
+    )
+    assert (
+        round(
+            text_extraction.calculate_edit_distance(
+                source_cct_word_space, source_cct, return_as="score"
+            ),
+            2,
+        )
         == 0.75
     )
     assert (
-        round(calculate_edit_distance(source_cct_spaces, source_cct, return_as="score"), 2) == 0.39
+        round(
+            text_extraction.calculate_edit_distance(
+                source_cct_spaces, source_cct, return_as="score"
+            ),
+            2,
+        )
+        == 0.39
     )
     assert (
-        round(calculate_edit_distance(source_cct_no_space, source_cct, return_as="score"), 2)
+        round(
+            text_extraction.calculate_edit_distance(
+                source_cct_no_space, source_cct, return_as="score"
+            ),
+            2,
+        )
         == 0.64
     )
     assert (
-        round(calculate_edit_distance(source_cct_one_sentence, source_cct, return_as="score"), 2)
+        round(
+            text_extraction.calculate_edit_distance(
+                source_cct_one_sentence, source_cct, return_as="score"
+            ),
+            2,
+        )
         == 0.0
     )
     assert (
-        round(calculate_edit_distance(source_cct_missing_word, source_cct, return_as="score"), 2)
+        round(
+            text_extraction.calculate_edit_distance(
+                source_cct_missing_word, source_cct, return_as="score"
+            ),
+            2,
+        )
         == 0.57
     )
     assert (
-        round(calculate_edit_distance(source_cct_addn_char, source_cct, return_as="score"), 2)
+        round(
+            text_extraction.calculate_edit_distance(
+                source_cct_addn_char, source_cct, return_as="score"
+            ),
+            2,
+        )
         == 0.89
     )
     assert (
-        round(calculate_edit_distance(source_cct_dup_word, source_cct, return_as="score"), 2)
+        round(
+            text_extraction.calculate_edit_distance(
+                source_cct_dup_word, source_cct, return_as="score"
+            ),
+            2,
+        )
         == 0.79
     )
 
@@ -59,8 +98,8 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
     elements = partition(filename=f"example-docs/{filename}")
     output_cct = "\n".join([str(el) for el in elements])
 
-    score = calculate_edit_distance(output_cct, source_cct, return_as="score")
-    distance = calculate_edit_distance(output_cct, source_cct, return_as="distance")
+    score = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="score")
+    distance = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="distance")
 
     assert score >= 0
     assert score <= 1.0
@@ -87,4 +126,4 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
     ],
 )
 def test_bag_of_words(text, expected):
-    assert core.bag_of_words(text) == expected
+    assert text_extraction.bag_of_words(text) == expected
diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py
index 880fce790d..79d5d1e0a1 100644
--- a/unstructured/metrics/text_extraction.py
+++ b/unstructured/metrics/text_extraction.py
@@ -71,8 +71,8 @@ def remove_punctuation(s: str, exclude_punctuation: Optional[list]) -> str:
 
 
 def bag_of_words(text: str) -> Dict[str, int]:
-    bow = Dict[str, int]
-    incorrect_word: str
+    bow: Dict[str, int] = {}
+    incorrect_word: str = ""
     words = remove_punctuation(text.lower(), ["-", "'"]).split()
 
     i = 0

From dcd053f69c6c6c390e0c53588bf237647fa9b9a9 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Mon, 9 Oct 2023 18:35:01 -0500
Subject: [PATCH 21/29] add new condition

---
 test_unstructured/metrics/test_text_extraction.py | 15 +++++++++++++++
 unstructured/metrics/text_extraction.py           |  8 ++++++++
 2 files changed, 23 insertions(+)

diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py
index 56a53d3617..358238d9ee 100644
--- a/test_unstructured/metrics/test_text_extraction.py
+++ b/test_unstructured/metrics/test_text_extraction.py
@@ -123,6 +123,21 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
             "I have a dog and a cat, I love my dog.",
             {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1},
         ),
+        (
+            "My dog's hair is red, but the dogs' houses are blue.",
+            {
+                "my": 1,
+                "dog's": 1,
+                "hair": 1,
+                "is": 2,
+                "red": 1,
+                "but": 1,
+                "the": 1,
+                "dogs": 1,
+                "house": 1,
+                "blue": 1,
+            },
+        ),
     ],
 )
 def test_bag_of_words(text, expected):
diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py
index 79d5d1e0a1..027a047d44 100644
--- a/unstructured/metrics/text_extraction.py
+++ b/unstructured/metrics/text_extraction.py
@@ -4,6 +4,8 @@
 
 from rapidfuzz.distance import Levenshtein
 
+from unstructured.nlp.patterns import ENDS_IN_PUNCT_RE
+
 
 def calculate_edit_distance(
     output: str,
@@ -75,6 +77,12 @@ def bag_of_words(text: str) -> Dict[str, int]:
     incorrect_word: str = ""
     words = remove_punctuation(text.lower(), ["-", "'"]).split()
 
+    # Remove remaining punctuation
+    for idx in range(len(words)):
+        punct = ENDS_IN_PUNCT_RE.findall(words[idx])
+        if punct:
+            words[idx] = words[idx].replace(punct[0], "")
+
     i = 0
     while i < len(words):
         if len(words[i]) > 1:

From e86da521a6196c7ea0a5863d66306a4a08cba033 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Mon, 9 Oct 2023 18:41:00 -0500
Subject: [PATCH 22/29] remove additional code

---
 test_unstructured/metrics/test_text_extraction.py | 2 +-
 unstructured/metrics/text_extraction.py           | 9 +--------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py
index 358238d9ee..735e7af2da 100644
--- a/test_unstructured/metrics/test_text_extraction.py
+++ b/test_unstructured/metrics/test_text_extraction.py
@@ -133,7 +133,7 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
                 "red": 1,
                 "but": 1,
                 "the": 1,
-                "dogs": 1,
+                "dog's": 1,
                 "house": 1,
                 "blue": 1,
             },
diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py
index 027a047d44..57fe29be77 100644
--- a/unstructured/metrics/text_extraction.py
+++ b/unstructured/metrics/text_extraction.py
@@ -1,11 +1,10 @@
 import sys
 import unicodedata
+
 from typing import Dict, Optional, Tuple
 
 from rapidfuzz.distance import Levenshtein
 
-from unstructured.nlp.patterns import ENDS_IN_PUNCT_RE
-
 
 def calculate_edit_distance(
     output: str,
@@ -77,12 +76,6 @@ def bag_of_words(text: str) -> Dict[str, int]:
     incorrect_word: str = ""
     words = remove_punctuation(text.lower(), ["-", "'"]).split()
 
-    # Remove remaining punctuation
-    for idx in range(len(words)):
-        punct = ENDS_IN_PUNCT_RE.findall(words[idx])
-        if punct:
-            words[idx] = words[idx].replace(punct[0], "")
-
     i = 0
     while i < len(words):
         if len(words[i]) > 1:

From 88ba596c5c18bd2e0ddcdbfd9a7c73c882fba576 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Mon, 9 Oct 2023 21:21:50 -0500
Subject: [PATCH 23/29] removes hyphens at the beginning of sentence

---
 .../metrics/test_text_extraction.py           |  2 +-
 unstructured/cleaners/core.py                 | 11 ++++++++-
 unstructured/metrics/text_extraction.py       | 24 +++----------------
 3 files changed, 14 insertions(+), 23 deletions(-)

diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py
index 735e7af2da..225a264791 100644
--- a/test_unstructured/metrics/test_text_extraction.py
+++ b/test_unstructured/metrics/test_text_extraction.py
@@ -133,7 +133,7 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
                 "red": 1,
                 "but": 1,
                 "the": 1,
-                "dog's": 1,
+                "dogs'": 1,
                 "house": 1,
                 "blue": 1,
             },
diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
index ba7ec592db..e5df8df0ce 100644
--- a/unstructured/cleaners/core.py
+++ b/unstructured/cleaners/core.py
@@ -2,7 +2,7 @@
 import re
 import sys
 import unicodedata
-from typing import Tuple
+from typing import Optional, Tuple
 
 import numpy as np
 
@@ -302,6 +302,15 @@ def remove_punctuation(s: str) -> str:
     return s
 
 
+def remove_sentence_punctuation(s: str, exclude_punctuation: Optional[list]) -> str:
+    tbl_new = tbl.copy()
+    if exclude_punctuation:
+        for punct in exclude_punctuation:
+            del tbl_new[ord(punct)]
+    s = s.translate(tbl_new)
+    return s
+
+
 def clean_extra_whitespace(text: str) -> str:
     """Cleans extra whitespace characters that appear between words.
 
diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py
index 57fe29be77..aba88c2178 100644
--- a/unstructured/metrics/text_extraction.py
+++ b/unstructured/metrics/text_extraction.py
@@ -1,9 +1,7 @@
-import sys
-import unicodedata
-
-from typing import Dict, Optional, Tuple
+from typing import Dict, Tuple
 
 from rapidfuzz.distance import Levenshtein
+from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation
 
 
 def calculate_edit_distance(
@@ -55,26 +53,10 @@ def calculate_edit_distance(
     return 0.0
 
 
-# Duplicate code from cleaners.core,
-# not sure we want this functionality introduced in the main library.
-def remove_punctuation(s: str, exclude_punctuation: Optional[list]) -> str:
-    """Removes punctuation from a given string."""
-
-    tbl = dict.fromkeys(
-        i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")
-    )
-
-    if exclude_punctuation:
-        for punct in exclude_punctuation:
-            del tbl[ord(punct)]
-    s = s.translate(tbl)
-    return s
-
-
 def bag_of_words(text: str) -> Dict[str, int]:
     bow: Dict[str, int] = {}
     incorrect_word: str = ""
-    words = remove_punctuation(text.lower(), ["-", "'"]).split()
+    words = clean_bullets(remove_sentence_punctuation(text.lower(), ["-", "'"])).split()
 
     i = 0
     while i < len(words):

From bd4620398c742e81cf394873ee4e5938f12fc13a Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Mon, 9 Oct 2023 21:22:51 -0500
Subject: [PATCH 24/29] formatted

---
 .../metrics/test_text_extraction.py           | 28 ++++++++++++++-----
 unstructured/metrics/text_extraction.py       |  1 +
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py
index 225a264791..f830a6ce9b 100644
--- a/test_unstructured/metrics/test_text_extraction.py
+++ b/test_unstructured/metrics/test_text_extraction.py
@@ -23,7 +23,9 @@ def test_calculate_edit_distance():
     assert (
         round(
             text_extraction.calculate_edit_distance(
-                source_cct_word_space, source_cct, return_as="score"
+                source_cct_word_space,
+                source_cct,
+                return_as="score",
             ),
             2,
         )
@@ -32,7 +34,9 @@ def test_calculate_edit_distance():
     assert (
         round(
             text_extraction.calculate_edit_distance(
-                source_cct_spaces, source_cct, return_as="score"
+                source_cct_spaces,
+                source_cct,
+                return_as="score",
             ),
             2,
         )
@@ -41,7 +45,9 @@ def test_calculate_edit_distance():
     assert (
         round(
             text_extraction.calculate_edit_distance(
-                source_cct_no_space, source_cct, return_as="score"
+                source_cct_no_space,
+                source_cct,
+                return_as="score",
             ),
             2,
         )
@@ -50,7 +56,9 @@ def test_calculate_edit_distance():
     assert (
         round(
             text_extraction.calculate_edit_distance(
-                source_cct_one_sentence, source_cct, return_as="score"
+                source_cct_one_sentence,
+                source_cct,
+                return_as="score",
             ),
             2,
         )
@@ -59,7 +67,9 @@ def test_calculate_edit_distance():
     assert (
         round(
             text_extraction.calculate_edit_distance(
-                source_cct_missing_word, source_cct, return_as="score"
+                source_cct_missing_word,
+                source_cct,
+                return_as="score",
             ),
             2,
         )
@@ -68,7 +78,9 @@ def test_calculate_edit_distance():
     assert (
         round(
             text_extraction.calculate_edit_distance(
-                source_cct_addn_char, source_cct, return_as="score"
+                source_cct_addn_char,
+                source_cct,
+                return_as="score",
             ),
             2,
         )
@@ -77,7 +89,9 @@ def test_calculate_edit_distance():
     assert (
         round(
             text_extraction.calculate_edit_distance(
-                source_cct_dup_word, source_cct, return_as="score"
+                source_cct_dup_word,
+                source_cct,
+                return_as="score",
             ),
             2,
         )
diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py
index aba88c2178..24aab5edce 100644
--- a/unstructured/metrics/text_extraction.py
+++ b/unstructured/metrics/text_extraction.py
@@ -1,6 +1,7 @@
 from typing import Dict, Tuple
 
 from rapidfuzz.distance import Levenshtein
+
 from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation
 
 

From 1838b9569f2cac7aff809e7547a3fde81aea04b9 Mon Sep 17 00:00:00 2001
From: Shreya Nidadavolu <shreyanid9@gmail.com>
Date: Tue, 10 Oct 2023 00:58:30 -0700
Subject: [PATCH 25/29] adding test for dash and hyphen

---
 .../metrics/test_text_extraction.py           | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py
index f830a6ce9b..ed6874faac 100644
--- a/test_unstructured/metrics/test_text_extraction.py
+++ b/test_unstructured/metrics/test_text_extraction.py
@@ -152,6 +152,27 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
                 "blue": 1,
             },
         ),
+        (
+            "Sometimes sentences have a dash - like this one! A hyphen connects 2 words with no gap: easy-peasy.",
+            {
+                "sometimes": 1,
+                "sentences": 1,
+                "have": 1,
+                "a": 2,
+                "dash": 1,
+                "like": 1,
+                "this": 1,
+                "one": 1,
+                "hyphen": 1,
+                "connects": 1,
+                "2": 1,
+                "words": 1,
+                "with": 1,
+                "no": 1,
+                "gap": 1,
+                "easy-peasy": 1,
+            },
+        ),
     ],
 )
 def test_bag_of_words(text, expected):

From 128ea22abc58ba734d5cf7d7fe0fc8935281b9e7 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Tue, 10 Oct 2023 09:16:37 -0500
Subject: [PATCH 26/29] add test

---
 .../metrics/test_text_extraction.py              | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py
index f830a6ce9b..8396763054 100644
--- a/test_unstructured/metrics/test_text_extraction.py
+++ b/test_unstructured/metrics/test_text_extraction.py
@@ -137,6 +137,22 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
             "I have a dog and a cat, I love my dog.",
             {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1},
         ),
+        (
+            "-First I'd like to install the anti-theft lock on my car.",
+            {
+                "first": 1,
+                "i'd": 1,
+                "like": 1,
+                "to": 1,
+                "install": 1,
+                "the": 1,
+                "anti-theft": 1,
+                "lock": 1,
+                "on": 1,
+                "my": 1,
+                "car": 1,
+            },
+        ),
         (
             "My dog's hair is red, but the dogs' houses are blue.",
             {

From 8dd9b06aed4bfeaed4d51e96b2975fbb96adb4bb Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Tue, 10 Oct 2023 09:38:46 -0500
Subject: [PATCH 27/29] removed test

---
 .../metrics/test_text_extraction.py           | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py
index b244e8c81d..2320c8e55e 100644
--- a/test_unstructured/metrics/test_text_extraction.py
+++ b/test_unstructured/metrics/test_text_extraction.py
@@ -137,22 +137,6 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
             "I have a dog and a cat, I love my dog.",
             {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1},
         ),
-        (
-            "-First I'd like to install the anti-theft lock on my car.",
-            {
-                "first": 1,
-                "i'd": 1,
-                "like": 1,
-                "to": 1,
-                "install": 1,
-                "the": 1,
-                "anti-theft": 1,
-                "lock": 1,
-                "on": 1,
-                "my": 1,
-                "car": 1,
-            },
-        ),
         (
             "My dog's hair is red, but the dogs' houses are blue.",
             {
@@ -169,7 +153,8 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
             },
         ),
         (
-            "Sometimes sentences have a dash - like this one! A hyphen connects 2 words with no gap: easy-peasy.",
+            """Sometimes sentences have a dash - like this one!
+            A hyphen connects 2 words with no gap: easy-peasy.""",
             {
                 "sometimes": 1,
                 "sentences": 1,

From 999cfc854d0edb699f09caa90ece2962657419aa Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Tue, 10 Oct 2023 11:05:15 -0500
Subject: [PATCH 28/29] fix logic to remove punctuation with spaces around it.

---
 test_unstructured/metrics/test_text_extraction.py | 3 ++-
 unstructured/metrics/text_extraction.py           | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py
index 2320c8e55e..b25bd3ef35 100644
--- a/test_unstructured/metrics/test_text_extraction.py
+++ b/test_unstructured/metrics/test_text_extraction.py
@@ -148,7 +148,8 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
                 "but": 1,
                 "the": 1,
                 "dogs'": 1,
-                "house": 1,
+                "houses": 1,
+                "are": 1,
                 "blue": 1,
             },
         ),
diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py
index 24aab5edce..39d1c395a9 100644
--- a/unstructured/metrics/text_extraction.py
+++ b/unstructured/metrics/text_extraction.py
@@ -75,7 +75,7 @@ def bag_of_words(text: str) -> Dict[str, int]:
                 incorrect_word += words[j]
                 j += 1
 
-            if len(incorrect_word) == 1:
+            if len(incorrect_word) == 1 and words[i].isalnum():
                 if incorrect_word in bow:
                     bow[incorrect_word] += 1
                 else:

From adfec61ea1279f4f75527726259e0928129561c6 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Date: Tue, 10 Oct 2023 11:30:09 -0500
Subject: [PATCH 29/29] fix test

---
 test_unstructured/metrics/test_text_extraction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py
index b25bd3ef35..5be9753428 100644
--- a/test_unstructured/metrics/test_text_extraction.py
+++ b/test_unstructured/metrics/test_text_extraction.py
@@ -143,7 +143,7 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
                 "my": 1,
                 "dog's": 1,
                 "hair": 1,
-                "is": 2,
+                "is": 1,
                 "red": 1,
                 "but": 1,
                 "the": 1,