diff --git a/unstructured/file_utils/metadata.py b/unstructured/file_utils/metadata.py
index 4677651256..e58831b2ce 100644
--- a/unstructured/file_utils/metadata.py
+++ b/unstructured/file_utils/metadata.py
@@ -1,16 +1,13 @@
 import datetime
 import io
 from dataclasses import dataclass, field
-from typing import IO, Any, Dict, Final, Iterable, Iterator, List, Optional
+from typing import IO, Any, Dict, Final, Optional
 
 import docx
 import openpyxl
 from PIL import Image
 from PIL.ExifTags import TAGS
 
-from unstructured.documents.elements import Element
-from unstructured.partition.lang import detect_languages
-
 # NOTE(robison) - ref: https://www.media.mit.edu/pia/Research/deepview/exif.html
 EXIF_DATETIME_FMT: Final[str] = "%Y:%m:%d %H:%M:%S"
 
@@ -153,43 +150,3 @@ def _get_exif_datetime(exif_dict: Dict[str, Any], key: str) -> Optional[datetime
     # using the standard EXIF datetime format
     except ValueError:
         return None
-
-
-def apply_lang_metadata(
-    elements: Iterable[Element],
-    languages: List[str],
-    detect_language_per_element: bool = False,
-) -> Iterator[Element]:
-    """Detect and apply metadata.languages to each element in `elements`."""
-    # -- Note this function has a stream interface, but reads the full `elements` stream into memory
-    # -- before emitting the first updated element as output.
-
-    # Skip language detection for partitioners that use other partitioners.
-    # For example, partition_msg relies on partition_html and partition_text, but the metadata
-    # gets overwritten after elements have been returned by _html and _text,
-    # so `languages` would be detected twice.
-    if languages == [""]:
-        yield from elements
-        return
-
-    if not isinstance(elements, List):
-        elements = list(elements)
-
-    full_text = " ".join(e.text for e in elements if hasattr(e, "text"))
-    detected_languages = detect_languages(text=full_text, languages=languages)
-    if (
-        detected_languages is not None
-        and len(languages) == 1
-        and detect_language_per_element is False
-    ):
-        # -- apply detected language to each metadata --
-        for e in elements:
-            e.metadata.languages = detected_languages
-            yield e
-    else:
-        for e in elements:
-            if hasattr(e, "text"):
-                e.metadata.languages = detect_languages(e.text)
-                yield e
-            else:
-                yield e
diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py
index e62a5f6ba9..3eece564a5 100644
--- a/unstructured/partition/csv.py
+++ b/unstructured/partition/csv.py
@@ -12,13 +12,13 @@
     process_metadata,
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.partition.common import (
     exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
     spooled_to_bytes_io_if_needed,
 )
+from unstructured.partition.lang import apply_lang_metadata
 
 DETECTION_ORIGIN: str = "csv"
 
diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
index 8fcc1dd564..a68fcb3ebf 100644
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@@ -56,13 +56,13 @@
     process_metadata,
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.partition.common import (
     convert_ms_office_table_to_text,
     exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
 )
+from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text_type import (
     is_bulleted_text,
     is_email_address,
diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py
index 50cbd787ac..c5d3a0f73a 100644
--- a/unstructured/partition/email.py
+++ b/unstructured/partition/email.py
@@ -15,12 +15,12 @@
     read_txt_file,
     validate_encoding,
 )
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.logger import logger
 from unstructured.partition.common import (
     convert_to_bytes,
     exactly_one,
 )
+from unstructured.partition.lang import apply_lang_metadata
 
 if sys.version_info < (3, 8):
     from typing_extensions import Final
diff --git a/unstructured/partition/epub.py b/unstructured/partition/epub.py
index 97ae777bdd..d514f06ac5 100644
--- a/unstructured/partition/epub.py
+++ b/unstructured/partition/epub.py
@@ -7,13 +7,13 @@
 from unstructured.chunking.title import add_chunking_strategy
 from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.partition.common import (
     exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
 )
 from unstructured.partition.html import partition_html
+from unstructured.partition.lang import apply_lang_metadata
 
 DETECTION_ORIGIN: str = "epub"
 
diff --git a/unstructured/partition/html.py b/unstructured/partition/html.py
index f5ca5c310c..207e79c19e 100644
--- a/unstructured/partition/html.py
+++ b/unstructured/partition/html.py
@@ -12,13 +12,13 @@
     FileType,
     add_metadata_with_filetype,
 )
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.partition.common import (
     document_to_element_list,
     exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
 )
+from unstructured.partition.lang import apply_lang_metadata
 
 if TYPE_CHECKING:
     from unstructured_inference.inference.layout import DocumentLayout
diff --git a/unstructured/partition/lang.py b/unstructured/partition/lang.py
index 48c41f1c07..d683c28b00 100644
--- a/unstructured/partition/lang.py
+++ b/unstructured/partition/lang.py
@@ -1,9 +1,10 @@
 import re
-from typing import List
+from typing import Iterable, Iterator, List
 
 import iso639
 from langdetect import DetectorFactory, detect_langs, lang_detect_exception
 
+from unstructured.documents.elements import Element
 from unstructured.logger import logger
 
 # pytesseract.get_languages(config="") only shows user installed language packs,
@@ -281,3 +282,58 @@ def detect_languages(
                 doc_languages.append(lang)
 
     return doc_languages
+
+
+def apply_lang_metadata(
+    elements: Iterable[Element],
+    languages: List[str],
+    detect_language_per_element: bool = False,
+) -> Iterator[Element]:
+    """Detect and apply metadata.languages to each element in `elements`.
+
+    One document-level result is shared by all elements when `languages` has exactly one
+    entry, `detect_language_per_element` is False and detection on the joined text does not
+    return None. Otherwise each element that has a `text` attribute is detected on its own.
+
+    Args:
+        elements: Elements to annotate. The whole iterable is read into memory before the
+            first element is emitted, unless detection is skipped.
+        languages: Languages forwarded to `detect_languages` for the document-level pass.
+            The sentinel `[""]` skips detection and yields `elements` untouched.
+        detect_language_per_element: Set to True to force per-element detection.
+
+    Yields:
+        The same element objects in their original order, with `metadata.languages`
+        assigned in place wherever detection was applied.
+    """
+    # Skip language detection for partitioners that use other partitioners.
+    # For example, partition_msg relies on partition_html and partition_text, but the metadata
+    # gets overwritten after elements have been returned by _html and _text,
+    # so `languages` would be detected twice.
+    if languages == [""]:
+        yield from elements
+        return
+
+    # -- The elements are traversed twice (once to join their text, once to emit them), so a
+    # -- one-shot iterable has to be materialized first.
+    if not isinstance(elements, list):
+        elements = list(elements)
+
+    full_text = " ".join(e.text for e in elements if hasattr(e, "text"))
+    detected_languages = detect_languages(text=full_text, languages=languages)
+    if (
+        detected_languages is not None
+        and len(languages) == 1
+        and detect_language_per_element is False
+    ):
+        # -- apply detected language to each metadata --
+        for e in elements:
+            e.metadata.languages = detected_languages
+            yield e
+        return
+
+    # -- per-element detection; elements without `text` pass through unchanged --
+    for e in elements:
+        if hasattr(e, "text"):
+            e.metadata.languages = detect_languages(e.text)
+        yield e
diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py
index 93c6dfd025..ec770b92cc 100644
--- a/unstructured/partition/msg.py
+++ b/unstructured/partition/msg.py
@@ -7,11 +7,11 @@
 from unstructured.chunking.title import add_chunking_strategy
 from unstructured.documents.elements import Element, ElementMetadata, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.logger import logger
 from unstructured.partition.common import exactly_one
 from unstructured.partition.email import convert_to_iso_8601
 from unstructured.partition.html import partition_html
+from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text import partition_text
 
 
diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py
index 38f28a2b9a..3fd2a2c906 100644
--- a/unstructured/partition/pptx.py
+++ b/unstructured/partition/pptx.py
@@ -28,13 +28,13 @@
     process_metadata,
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.partition.common import (
     convert_ms_office_table_to_text,
     exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
 )
+from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text_type import (
     is_email_address,
     is_possible_narrative_text,
diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py
index e846c75a96..eddfbb7629 100644
--- a/unstructured/partition/text.py
+++ b/unstructured/partition/text.py
@@ -22,7 +22,6 @@
 )
 from unstructured.file_utils.encoding import read_txt_file
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.nlp.patterns import PARAGRAPH_PATTERN
 from unstructured.nlp.tokenize import sent_tokenize
 from unstructured.partition.common import (
@@ -30,6 +29,7 @@
     get_last_modified_date,
     get_last_modified_date_from_file,
 )
+from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text_type import (
     is_bulleted_text,
     is_email_address,
diff --git a/unstructured/partition/tsv.py b/unstructured/partition/tsv.py
index 5f65f062c5..14654c02be 100644
--- a/unstructured/partition/tsv.py
+++ b/unstructured/partition/tsv.py
@@ -11,13 +11,13 @@
     process_metadata,
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.partition.common import (
     exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
     spooled_to_bytes_io_if_needed,
 )
+from unstructured.partition.lang import apply_lang_metadata
 
 DETECTION_ORIGIN: str = "tsv"
 
diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py
index ea96663af2..b5bcf32de6 100644
--- a/unstructured/partition/xlsx.py
+++ b/unstructured/partition/xlsx.py
@@ -18,13 +18,13 @@
     process_metadata,
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.partition.common import (
     exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
     spooled_to_bytes_io_if_needed,
 )
+from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text_type import (
     is_bulleted_text,
     is_possible_narrative_text,
diff --git a/unstructured/partition/xml.py b/unstructured/partition/xml.py
index 31cad9d247..f7bf6e5ef6 100644
--- a/unstructured/partition/xml.py
+++ b/unstructured/partition/xml.py
@@ -14,13 +14,13 @@
 )
 from unstructured.file_utils.encoding import read_txt_file
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.file_utils.metadata import apply_lang_metadata
 from unstructured.partition.common import (
     exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
     spooled_to_bytes_io_if_needed,
 )
+from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text import element_from_text
 
 DETECTION_ORIGIN: str = "xml"