move apply_lang_metadata to lang.py to avoid adding unnecessary dependencies from metadata.py to partitioners
Unstructured-IO · Oct 6, 2023 · dd51858 · dd51858
1 parent d3faf31
commit dd51858
Show file tree

Hide file tree

Showing 13 changed files with 54 additions and 56 deletions.
diff --git a/unstructured/file_utils/metadata.py b/unstructured/file_utils/metadata.py
@@ -1,16 +1,13 @@
 import datetime
 import io
 from dataclasses import dataclass, field
-from typing import IO, Any, Dict, Final, Iterable, Iterator, List, Optional
+from typing import IO, Any, Dict, Final, Optional
 
 import docx
 import openpyxl
 from PIL import Image
 from PIL.ExifTags import TAGS
 
-from unstructured.documents.elements import Element
-from unstructured.partition.lang import detect_languages
-
 # NOTE(robison) - ref: https://www.media.mit.edu/pia/Research/deepview/exif.html
 EXIF_DATETIME_FMT: Final[str] = "%Y:%m:%d %H:%M:%S"
 
@@ -153,43 +150,3 @@ def _get_exif_datetime(exif_dict: Dict[str, Any], key: str) -> Optional[datetime
     # using the standard EXIF datetime format
     except ValueError:
         return None
-
-
-def apply_lang_metadata(
-    elements: Iterable[Element],
-    languages: List[str],
-    detect_language_per_element: bool = False,
-) -> Iterator[Element]:
-    """Detect and apply metadata.languages to each element in `elements`."""
-    # -- Note this function has a stream interface, but reads the full `elements` stream into memory
-    # -- before emitting the first updated element as output.
-
-    # Skip language detection for partitioners that use other partitioners.
-    # For example, partition_msg relies on partition_html and partition_text, but the metadata
-    # gets overwritten after elements have been returned by _html and _text,
-    # so `languages` would be detected twice.
-    if languages == [""]:
-        yield from elements
-        return
-
-    if not isinstance(elements, List):
-        elements = list(elements)
-
-    full_text = " ".join(e.text for e in elements if hasattr(e, "text"))
-    detected_languages = detect_languages(text=full_text, languages=languages)
-    if (
-        detected_languages is not None
-        and len(languages) == 1
-        and detect_language_per_element is False
-    ):
-        # -- apply detected language to each metadata --
-        for e in elements:
-            e.metadata.languages = detected_languages
-            yield e
-    else:
-        for e in elements:
-            if hasattr(e, "text"):
-                e.metadata.languages = detect_languages(e.text)
-                yield e
-            else:
-                yield e
diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py
@@ -12,13 +12,13 @@
     process_metadata,
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.partition.common import (
     exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
     spooled_to_bytes_io_if_needed,
 )
+from unstructured.partition.lang import apply_lang_metadata
 
 DETECTION_ORIGIN: str = "csv"
 

diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
@@ -56,13 +56,13 @@
     process_metadata,
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.partition.common import (
     convert_ms_office_table_to_text,
     exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
 )
+from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text_type import (
     is_bulleted_text,
     is_email_address,

diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py
@@ -15,12 +15,12 @@
     read_txt_file,
     validate_encoding,
 )
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.logger import logger
 from unstructured.partition.common import (
     convert_to_bytes,
     exactly_one,
 )
+from unstructured.partition.lang import apply_lang_metadata
 
 if sys.version_info < (3, 8):
     from typing_extensions import Final

diff --git a/unstructured/partition/epub.py b/unstructured/partition/epub.py
@@ -7,13 +7,13 @@
 from unstructured.chunking.title import add_chunking_strategy
 from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.partition.common import (
     exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
 )
 from unstructured.partition.html import partition_html
+from unstructured.partition.lang import apply_lang_metadata
 
 DETECTION_ORIGIN: str = "epub"
 

diff --git a/unstructured/partition/html.py b/unstructured/partition/html.py
@@ -12,13 +12,13 @@
     FileType,
     add_metadata_with_filetype,
 )
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.partition.common import (
     document_to_element_list,
     exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
 )
+from unstructured.partition.lang import apply_lang_metadata
 
 if TYPE_CHECKING:
     from unstructured_inference.inference.layout import DocumentLayout

diff --git a/unstructured/partition/lang.py b/unstructured/partition/lang.py
@@ -1,9 +1,10 @@
 import re
-from typing import List
+from typing import Iterable, Iterator, List
 
 import iso639
 from langdetect import DetectorFactory, detect_langs, lang_detect_exception
 
+from unstructured.documents.elements import Element
 from unstructured.logger import logger
 
 # pytesseract.get_languages(config="") only shows user installed language packs,
@@ -281,3 +282,43 @@ def detect_languages(
                 doc_languages.append(lang)
 
     return doc_languages
+
+
+def apply_lang_metadata(
+    elements: Iterable[Element],
+    languages: List[str],
+    detect_language_per_element: bool = False,
+) -> Iterator[Element]:
+    """Detect and apply metadata.languages to each element in `elements`."""
+    # -- Note this function has a stream interface, but reads the full `elements` stream into memory
+    # -- before emitting the first updated element as output.
+
+    # Skip language detection for partitioners that use other partitioners.
+    # For example, partition_msg relies on partition_html and partition_text, but the metadata
+    # gets overwritten after elements have been returned by _html and _text,
+    # so `languages` would be detected twice.
+    if languages == [""]:
+        yield from elements
+        return
+
+    if not isinstance(elements, List):
+        elements = list(elements)
+
+    full_text = " ".join(e.text for e in elements if hasattr(e, "text"))
+    detected_languages = detect_languages(text=full_text, languages=languages)
+    if (
+        detected_languages is not None
+        and len(languages) == 1
+        and detect_language_per_element is False
+    ):
+        # -- apply detected language to each metadata --
+        for e in elements:
+            e.metadata.languages = detected_languages
+            yield e
+    else:
+        for e in elements:
+            if hasattr(e, "text"):
+                e.metadata.languages = detect_languages(e.text)
+                yield e
+            else:
+                yield e
diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py
@@ -7,11 +7,11 @@
 from unstructured.chunking.title import add_chunking_strategy
 from unstructured.documents.elements import Element, ElementMetadata, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.logger import logger
 from unstructured.partition.common import exactly_one
 from unstructured.partition.email import convert_to_iso_8601
 from unstructured.partition.html import partition_html
+from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text import partition_text
 
 

diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py
@@ -28,13 +28,13 @@
     process_metadata,
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.partition.common import (
     convert_ms_office_table_to_text,
     exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
 )
+from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text_type import (
     is_email_address,
     is_possible_narrative_text,

diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py
@@ -22,14 +22,14 @@
 )
 from unstructured.file_utils.encoding import read_txt_file
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.nlp.patterns import PARAGRAPH_PATTERN
 from unstructured.nlp.tokenize import sent_tokenize
 from unstructured.partition.common import (
     exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
 )
+from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text_type import (
     is_bulleted_text,
     is_email_address,

diff --git a/unstructured/partition/tsv.py b/unstructured/partition/tsv.py
@@ -11,13 +11,13 @@
     process_metadata,
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.partition.common import (
     exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
     spooled_to_bytes_io_if_needed,
 )
+from unstructured.partition.lang import apply_lang_metadata
 
 DETECTION_ORIGIN: str = "tsv"
 

diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py
@@ -18,13 +18,13 @@
     process_metadata,
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.partition.common import (
     exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
     spooled_to_bytes_io_if_needed,
 )
+from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text_type import (
     is_bulleted_text,
     is_possible_narrative_text,

diff --git a/unstructured/partition/xml.py b/unstructured/partition/xml.py
@@ -14,13 +14,13 @@
 )
 from unstructured.file_utils.encoding import read_txt_file
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.partition.common import (
     exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
     spooled_to_bytes_io_if_needed,
 )
+from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text import element_from_text
 
 DETECTION_ORIGIN: str = "xml"