Skip to content

Commit

Permalink
add tests for languages arg and raising TypeError
Browse files Browse the repository at this point in the history
  • Loading branch information
Coniferish committed Oct 6, 2023
1 parent 0d9f135 commit ba79d8a
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 22 deletions.
6 changes: 6 additions & 0 deletions test_unstructured/partition/csv/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,3 +215,9 @@ def test_partition_csv_element_metadata_has_languages():
filename = "example-docs/stanley-cups.csv"
elements = partition_csv(filename=filename, strategy="fast")
assert elements[0].metadata.languages == ["eng"]


def test_partition_csv_element_metadata_accepts_languages_arg():
    """Check that an explicit ``languages`` list is reported in element metadata.

    Partitions the sample CSV with ``languages=["deu"]`` and asserts that the
    first returned element carries exactly that list.
    """
    elements = partition_csv(
        filename="example-docs/stanley-cups.csv",
        strategy="fast",
        languages=["deu"],
    )
    first_element = elements[0]
    assert first_element.metadata.languages == ["deu"]
6 changes: 6 additions & 0 deletions test_unstructured/partition/docx/test_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,12 @@ def test_partition_docx_detects_multiple_elements_in_other_language():
assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]


def test_partition_docx_element_metadata_accepts_languages_arg():
    """Check that an explicit ``languages`` list is reported in element metadata.

    Partitions the sample DOCX with ``languages=["deu"]`` and asserts that the
    first returned element carries exactly that list.
    """
    elements = partition_docx(
        filename="example-docs/handbook-1p.docx",
        languages=["deu"],
    )
    first_element = elements[0]
    assert first_element.metadata.languages == ["deu"]


def test_invalid_languages_arg_raises_TypeError():
with pytest.raises(TypeError):
filename = "example-docs/handbook-1p.docx"
Expand Down
8 changes: 0 additions & 8 deletions test_unstructured/partition/odt/test_odt.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import os
import pathlib

import pytest

from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import Table, TableChunk, Title
from unstructured.partition.json import partition_json
Expand Down Expand Up @@ -210,9 +208,3 @@ def test_partition_odt_detects_multiple_elements_in_other_language():
elements = partition_odt(filename=filename, detect_language_per_element=True)
langs = [element.metadata.languages for element in elements]
assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]


def test_invalid_languages_arg_raises_TypeError():
    """Expect ``TypeError`` when ``languages`` is a bare string, not a list."""
    with pytest.raises(TypeError):
        partition_odt(filename="example-docs/fake.odt", languages="eng")
26 changes: 12 additions & 14 deletions test_unstructured/partition/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,27 +525,25 @@ def test_partition_text_element_metadata_has_languages():
assert elements[0].metadata.languages == ["eng"]


# def test_partition_text_detects_quote_in_other_language():
# filename = "example-docs/language-docs/eng_spa.txt"
# elements = partition_text(filename=filename, detect_language_per_element=True)
# langs = list({element.metadata.languages[0] for element in elements})
# assert set(langs) == set(["eng", "spa"])


# def test_partition_text_detects_quotes_in_multiple_languages():
# filename = "example-docs/language-docs/eng_afr_spa.txt"
# elements = partition_text(filename=filename, detect_language_per_element=True)
# langs = list({element.metadata.languages[0] for element in elements})
# assert set(langs) == set(["eng", "spa"])


def test_partition_text_detects_multiple_elements_in_other_language():
    """Check the per-element languages reported for the eng/spa sample text.

    Partitions the file with ``detect_language_per_element=True`` and compares
    each element's ``metadata.languages`` list, in order, against the expected
    sequence.
    """
    elements = partition_text(
        filename="example-docs/language-docs/eng_spa_mult.txt",
        detect_language_per_element=True,
    )
    detected = [el.metadata.languages for el in elements]
    expected = [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
    assert detected == expected


def test_partition_text_element_metadata_accepts_languages_arg():
    """Check that an explicit ``languages`` list is reported in element metadata.

    Partitions the sample text file with ``languages=["deu"]`` and asserts that
    the first returned element carries exactly that list.
    """
    elements = partition_text(
        filename="example-docs/norwich-city.txt",
        languages=["deu"],
    )
    first_element = elements[0]
    assert first_element.metadata.languages == ["deu"]


def test_partition_text_element_metadata_raises_TypeError():
    """Expect ``TypeError`` when ``languages`` is a bare string, not a list."""
    with pytest.raises(TypeError):
        partition_text(filename="example-docs/norwich-city.txt", languages="eng")


def test_partition_text_detects_more_than_3_languages():
filename = "example-docs/language-docs/UDHR_first_article_all.txt"
elements = partition_text(filename=filename, detect_language_per_element=True)
Expand Down

0 comments on commit ba79d8a

Please sign in to comment.