Skip to content

Commit

Permalink
add comment about why detect_language_per_element is not included in…
Browse files Browse the repository at this point in the history
… csv and tsv partitions and add param to xlsx partition
  • Loading branch information
Coniferish committed Oct 6, 2023
1 parent 0f2c251 commit 0d9f135
Show file tree
Hide file tree
Showing 7 changed files with 19 additions and 11 deletions.
Binary file added example-docs/language-docs/eng_spa.xlsx
Binary file not shown.
Binary file removed example-docs/language-docs/eng_spa_mult.xlsx
Binary file not shown.
2 changes: 1 addition & 1 deletion test_unstructured/partition/csv/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def test_add_chunking_strategy_to_partition_csv_non_default():


# NOTE (jennings) partition_csv returns a single TableElement per sheet,
# so no adding tests for multiple languages like the other partitions
# so leaving off additional tests for multiple languages like the other partitions
def test_partition_csv_element_metadata_has_languages():
filename = "example-docs/stanley-cups.csv"
elements = partition_csv(filename=filename, strategy="fast")
Expand Down
10 changes: 8 additions & 2 deletions test_unstructured/partition/xlsx/test_xlsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,9 +231,15 @@ def test_partition_xlsx_subtables(filename="example-docs/vodafone.xlsx"):
assert len(elements) == 6


# NOTE (jennings) partition_xlsx returns a single TableElement per sheet,
# so no adding tests for multiple languages like the other partitions
def test_partition_xlsx_element_metadata_has_languages():
    """The first element partitioned from stanley-cups.xlsx reports ``["eng"]``."""
    first_element = partition_xlsx(filename="example-docs/stanley-cups.xlsx")[0]
    assert first_element.metadata.languages == ["eng"]


def test_partition_eml_detects_multiple_elements_in_other_language():
    """Per-element language detection on eng_spa.xlsx reports both ``eng`` and ``spa``.

    NOTE(review): despite the ``eml`` in its name, this test exercises
    ``partition_xlsx``. The name is left unchanged so existing test selections
    keep matching; consider renaming it in a follow-up.
    """
    filename = "example-docs/language-docs/eng_spa.xlsx"
    elements = partition_xlsx(filename=filename, detect_language_per_element=True)
    # Skip elements whose ``languages`` metadata is unset or empty so the test fails on
    # the language assertions below rather than on a TypeError/IndexError while indexing.
    langs = {
        element.metadata.languages[0]
        for element in elements
        if element.metadata.languages
    }
    assert "eng" in langs
    assert "spa" in langs
4 changes: 3 additions & 1 deletion unstructured/partition/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ def partition_csv(
metadata_last_modified: Optional[str] = None,
include_metadata: bool = True,
languages: List[str] = ["auto"],
# NOTE (jennings) partition_csv generates a single TableElement
# so detect_language_per_element is not included as a param
**kwargs,
) -> List[Element]:
"""Partitions Microsoft Excel Documents in .csv format into its document elements.
Expand Down Expand Up @@ -81,7 +83,7 @@ def partition_csv(
metadata = ElementMetadata()

elements = apply_lang_metadata(
[Table(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN))],
[Table(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)],
languages=languages,
)

Expand Down
2 changes: 2 additions & 0 deletions unstructured/partition/tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ def partition_tsv(
metadata_last_modified: Optional[str] = None,
include_metadata: bool = True,
languages: List[str] = ["auto"],
# NOTE (jennings) partition_tsv generates a single TableElement
# so detect_language_per_element is not included as a param
**kwargs,
) -> List[Element]:
"""Partitions TSV files into document elements.
Expand Down
12 changes: 5 additions & 7 deletions unstructured/partition/xlsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
get_last_modified_date_from_file,
spooled_to_bytes_io_if_needed,
)
from unstructured.partition.lang import detect_languages
from unstructured.partition.text_type import (
is_bulleted_text,
is_possible_narrative_text,
Expand All @@ -45,6 +44,7 @@ def partition_xlsx(
metadata_filename: Optional[str] = None,
include_metadata: bool = True,
languages: List[str] = ["auto"],
detect_language_per_element: bool = False,
metadata_last_modified: Optional[str] = None,
include_header: bool = False,
find_subtable: bool = True,
Expand All @@ -64,6 +64,9 @@ def partition_xlsx(
User defined value for metadata.languages if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
metadata_last_modified
The day of the last modification
include_header
Expand Down Expand Up @@ -140,10 +143,8 @@ def partition_xlsx(

if front_non_consecutive is not None:
for content in single_non_empty_row_contents[: front_non_consecutive + 1]:
languages = detect_languages(str(content), languages)
element = _check_content_element_type(str(content))
element.metadata = metadata
element.metadata.languages = languages
elements.append(element)

if subtable is not None and len(subtable) == 1:
Expand All @@ -154,27 +155,24 @@ def partition_xlsx(
# parse subtables as html
html_text = subtable.to_html(index=False, header=include_header, na_rep="")
text = soupparser_fromstring(html_text).text_content()
languages = detect_languages(text, languages)
subtable = Table(text=text)
subtable.metadata = metadata
subtable.metadata.text_as_html = html_text
subtable.metadata.languages = languages
elements.append(subtable)

if front_non_consecutive is not None and last_non_consecutive is not None:
for content in single_non_empty_row_contents[
front_non_consecutive + 1 : # noqa: E203
]:
languages = detect_languages(str(content), languages)
element = _check_content_element_type(str(content))
element.metadata = metadata
element.metadata.languages = languages
elements.append(element)

elements = list(
apply_lang_metadata(
elements=elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
),
)
return elements
Expand Down

0 comments on commit 0d9f135

Please sign in to comment.