Skip to content

Commit

Permalink
move apply_lang_metadata to lang.py to avoid adding unnecessary dependencies from metadata.py to partitioners
Browse files Browse the repository at this point in the history
  • Loading branch information
Coniferish committed Oct 6, 2023
1 parent d3faf31 commit dd51858
Show file tree
Hide file tree
Showing 13 changed files with 54 additions and 56 deletions.
45 changes: 1 addition & 44 deletions unstructured/file_utils/metadata.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,13 @@
import datetime
import io
from dataclasses import dataclass, field
from typing import IO, Any, Dict, Final, Iterable, Iterator, List, Optional
from typing import IO, Any, Dict, Final, Optional

import docx
import openpyxl
from PIL import Image
from PIL.ExifTags import TAGS

from unstructured.documents.elements import Element
from unstructured.partition.lang import detect_languages

# NOTE(robison) - ref: https://www.media.mit.edu/pia/Research/deepview/exif.html
EXIF_DATETIME_FMT: Final[str] = "%Y:%m:%d %H:%M:%S"

Expand Down Expand Up @@ -153,43 +150,3 @@ def _get_exif_datetime(exif_dict: Dict[str, Any], key: str) -> Optional[datetime
# using the standard EXIF datetime format
except ValueError:
return None


def apply_lang_metadata(
    elements: Iterable[Element],
    languages: List[str],
    detect_language_per_element: bool = False,
) -> Iterator[Element]:
    """Populate `metadata.languages` on the given elements and yield them back in order.

    Args:
        elements: The elements to annotate. Anything that is not already a list is read fully
            into memory before the first element is yielded.
        languages: Language specification forwarded to `detect_languages` for the
            document-level pass. The value `[""]` switches language detection off.
        detect_language_per_element: When not `False`, languages are detected separately for
            each element rather than once for the combined text.

    Yields:
        The same element objects that were passed in, in their original order.
    """
    # -- `[""]` is the opt-out used by partitioners that delegate to other partitioners
    # -- (e.g. partition_msg calling partition_html / partition_text). Their metadata is
    # -- overwritten after the delegate returns, so detecting here would do the work twice.
    if languages == [""]:
        yield from elements
        return

    # -- the document-level pass needs every element's text, so drain the stream first --
    element_list = elements if isinstance(elements, List) else list(elements)

    texts = [el.text for el in element_list if hasattr(el, "text")]
    document_languages = detect_languages(text=" ".join(texts), languages=languages)

    # -- the document-wide result is only used when detection produced something, exactly one
    # -- entry was given in `languages`, and per-element detection was not requested
    apply_document_wide = (
        document_languages is not None
        and len(languages) == 1
        and detect_language_per_element is False
    )

    for el in element_list:
        if apply_document_wide:
            el.metadata.languages = document_languages
        elif hasattr(el, "text"):
            # -- per-element detection; elements without text are yielded untouched --
            el.metadata.languages = detect_languages(el.text)
        yield el
2 changes: 1 addition & 1 deletion unstructured/partition/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@
process_metadata,
)
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.file_utils.metadata import apply_lang_metadata
from unstructured.partition.common import (
exactly_one,
get_last_modified_date,
get_last_modified_date_from_file,
spooled_to_bytes_io_if_needed,
)
from unstructured.partition.lang import apply_lang_metadata

DETECTION_ORIGIN: str = "csv"

Expand Down
2 changes: 1 addition & 1 deletion unstructured/partition/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,13 @@
process_metadata,
)
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.file_utils.metadata import apply_lang_metadata
from unstructured.partition.common import (
convert_ms_office_table_to_text,
exactly_one,
get_last_modified_date,
get_last_modified_date_from_file,
)
from unstructured.partition.lang import apply_lang_metadata
from unstructured.partition.text_type import (
is_bulleted_text,
is_email_address,
Expand Down
2 changes: 1 addition & 1 deletion unstructured/partition/email.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@
read_txt_file,
validate_encoding,
)
from unstructured.file_utils.metadata import apply_lang_metadata
from unstructured.logger import logger
from unstructured.partition.common import (
convert_to_bytes,
exactly_one,
)
from unstructured.partition.lang import apply_lang_metadata

if sys.version_info < (3, 8):
from typing_extensions import Final
Expand Down
2 changes: 1 addition & 1 deletion unstructured/partition/epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@
from unstructured.chunking.title import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.file_utils.metadata import apply_lang_metadata
from unstructured.partition.common import (
exactly_one,
get_last_modified_date,
get_last_modified_date_from_file,
)
from unstructured.partition.html import partition_html
from unstructured.partition.lang import apply_lang_metadata

DETECTION_ORIGIN: str = "epub"

Expand Down
2 changes: 1 addition & 1 deletion unstructured/partition/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@
FileType,
add_metadata_with_filetype,
)
from unstructured.file_utils.metadata import apply_lang_metadata
from unstructured.partition.common import (
document_to_element_list,
exactly_one,
get_last_modified_date,
get_last_modified_date_from_file,
)
from unstructured.partition.lang import apply_lang_metadata

if TYPE_CHECKING:
from unstructured_inference.inference.layout import DocumentLayout
Expand Down
43 changes: 42 additions & 1 deletion unstructured/partition/lang.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import re
from typing import List
from typing import Iterable, Iterator, List

import iso639
from langdetect import DetectorFactory, detect_langs, lang_detect_exception

from unstructured.documents.elements import Element
from unstructured.logger import logger

# pytesseract.get_languages(config="") only shows user installed language packs,
Expand Down Expand Up @@ -281,3 +282,43 @@ def detect_languages(
doc_languages.append(lang)

return doc_languages


def apply_lang_metadata(
    elements: Iterable[Element],
    languages: List[str],
    detect_language_per_element: bool = False,
) -> Iterator[Element]:
    """Detect and apply `metadata.languages` to each element in `elements`.

    Args:
        elements: Elements to annotate. This function has a stream interface, but unless
            detection is skipped it reads the full `elements` stream into memory before
            emitting the first updated element.
        languages: Language specification passed to `detect_languages` together with the
            combined text of all elements. Passing `[""]` skips language detection entirely.
        detect_language_per_element: When not `False`, run detection on each element's own
            text instead of applying one document-level result to every element.

    Yields:
        The same element objects, in input order. In the document-level case each element
        receives its own copy of the detected-languages list.
    """
    # Skip language detection for partitioners that use other partitioners.
    # For example, partition_msg relies on partition_html and partition_text, but the metadata
    # gets overwritten after elements have been returned by _html and _text,
    # so `languages` would be detected twice.
    if languages == [""]:
        yield from elements
        return

    # -- the document-level pass needs the text of every element, so materialize the stream --
    if not isinstance(elements, list):
        elements = list(elements)

    full_text = " ".join(e.text for e in elements if hasattr(e, "text"))
    detected_languages = detect_languages(text=full_text, languages=languages)

    if (
        detected_languages is not None
        and len(languages) == 1
        and detect_language_per_element is False
    ):
        # -- apply detected language to each metadata --
        for e in elements:
            # -- copy per element: sharing one list object would let a later in-place edit of
            # -- one element's languages silently change every other element as well
            e.metadata.languages = list(detected_languages)
            yield e
        return

    # -- per-element detection; elements without a `text` attribute pass through unchanged --
    for e in elements:
        if hasattr(e, "text"):
            e.metadata.languages = detect_languages(e.text)
        yield e
2 changes: 1 addition & 1 deletion unstructured/partition/msg.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@
from unstructured.chunking.title import add_chunking_strategy
from unstructured.documents.elements import Element, ElementMetadata, process_metadata
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.file_utils.metadata import apply_lang_metadata
from unstructured.logger import logger
from unstructured.partition.common import exactly_one
from unstructured.partition.email import convert_to_iso_8601
from unstructured.partition.html import partition_html
from unstructured.partition.lang import apply_lang_metadata
from unstructured.partition.text import partition_text


Expand Down
2 changes: 1 addition & 1 deletion unstructured/partition/pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@
process_metadata,
)
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.file_utils.metadata import apply_lang_metadata
from unstructured.partition.common import (
convert_ms_office_table_to_text,
exactly_one,
get_last_modified_date,
get_last_modified_date_from_file,
)
from unstructured.partition.lang import apply_lang_metadata
from unstructured.partition.text_type import (
is_email_address,
is_possible_narrative_text,
Expand Down
2 changes: 1 addition & 1 deletion unstructured/partition/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@
)
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.file_utils.metadata import apply_lang_metadata
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
from unstructured.nlp.tokenize import sent_tokenize
from unstructured.partition.common import (
exactly_one,
get_last_modified_date,
get_last_modified_date_from_file,
)
from unstructured.partition.lang import apply_lang_metadata
from unstructured.partition.text_type import (
is_bulleted_text,
is_email_address,
Expand Down
2 changes: 1 addition & 1 deletion unstructured/partition/tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@
process_metadata,
)
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.file_utils.metadata import apply_lang_metadata
from unstructured.partition.common import (
exactly_one,
get_last_modified_date,
get_last_modified_date_from_file,
spooled_to_bytes_io_if_needed,
)
from unstructured.partition.lang import apply_lang_metadata

DETECTION_ORIGIN: str = "tsv"

Expand Down
2 changes: 1 addition & 1 deletion unstructured/partition/xlsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@
process_metadata,
)
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.file_utils.metadata import apply_lang_metadata
from unstructured.partition.common import (
exactly_one,
get_last_modified_date,
get_last_modified_date_from_file,
spooled_to_bytes_io_if_needed,
)
from unstructured.partition.lang import apply_lang_metadata
from unstructured.partition.text_type import (
is_bulleted_text,
is_possible_narrative_text,
Expand Down
2 changes: 1 addition & 1 deletion unstructured/partition/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@
)
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.file_utils.metadata import apply_lang_metadata
from unstructured.partition.common import (
exactly_one,
get_last_modified_date,
get_last_modified_date_from_file,
spooled_to_bytes_io_if_needed,
)
from unstructured.partition.lang import apply_lang_metadata
from unstructured.partition.text import element_from_text

DETECTION_ORIGIN: str = "xml"
Expand Down

0 comments on commit dd51858

Please sign in to comment.