Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

unstructured: fix metadata order mixed up #336

Merged
merged 22 commits into from
Feb 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
eb929c5
Optional meta field for UnstructuredFileConverter with proper tests
lambda-science Jan 18, 2024
a0da2e8
black lint
lambda-science Jan 18, 2024
b310858
Merge branch 'main' into feat/unstructured_meta_field
lambda-science Jan 18, 2024
4a90be7
Adding multiple files and meta list test case
lambda-science Jan 18, 2024
3bc51a7
Black formatting test
lambda-science Jan 18, 2024
86eec4d
Fixing metadata page number bug. Deep copy of dict
lambda-science Jan 19, 2024
d06e2c3
Merge branch 'main' into feat/unstructured_meta_field
Jan 19, 2024
99a2847
Folder of files test
lambda-science Jan 22, 2024
3a80a94
Merge remote-tracking branch 'origin/feat/unstructured_meta_field' in…
lambda-science Jan 22, 2024
cabf5d8
Update integrations/unstructured/src/haystack_integrations/components…
Jan 23, 2024
52b0663
Update integrations/unstructured/src/haystack_integrations/components…
Jan 23, 2024
0f81c16
Update integrations/unstructured/src/haystack_integrations/components…
Jan 23, 2024
a7d9b74
Renaming "name" meta to "file_path" and deepcopy fix
lambda-science Jan 23, 2024
736f699
Fix Ruff Complaining
lambda-science Jan 23, 2024
096ab49
Removing unique file logic using set that does not preserve file orde…
lambda-science Feb 4, 2024
e2b3852
Better test to make sure metadata order is preserved.
lambda-science Feb 4, 2024
1c4802e
Make a failing test if metadata list and directory
lambda-science Feb 4, 2024
dd40d0a
filepaths as lists
lambda-science Feb 4, 2024
0199e3c
Merge branch 'main' into feat/unstructured_meta_field
lambda-science Feb 4, 2024
7358a0a
Update integrations/unstructured/src/haystack_integrations/components…
Feb 5, 2024
534d1b1
update meta docstrings
lambda-science Feb 5, 2024
5e9e46d
Merge branch 'main' into feat/unstructured_meta_field
lambda-science Feb 5, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -105,19 +105,23 @@ def run(
This value can be either a list of dictionaries or a single dictionary.
If it's a single dictionary, its content is added to the metadata of all produced Documents.
If it's a list, the length of the list must match the number of paths, because the two lists will be zipped.
Please note that if the paths contain directories, the length of the meta list must match
the actual number of files contained.
Please note that if the paths contain directories, meta can only be a single dictionary
(same metadata for all files).
Defaults to `None`.
"""

unique_paths = {Path(path) for path in paths}
filepaths = {path for path in unique_paths if path.is_file()}
filepaths_in_directories = {
filepath for path in unique_paths if path.is_dir() for filepath in path.glob("*.*") if filepath.is_file()
}

all_filepaths = filepaths.union(filepaths_in_directories)

paths_obj = [Path(path) for path in paths]
filepaths = [path for path in paths_obj if path.is_file()]
filepaths_in_directories = [
filepath for path in paths_obj if path.is_dir() for filepath in path.glob("*.*") if filepath.is_file()
]
if filepaths_in_directories and isinstance(meta, list):
error = """"If providing directories in the `paths` parameter,
`meta` can only be a dictionary (metadata applied to every file),
and not a list. To specify different metadata for each file,
provide an explicit list of direct paths instead."""
raise ValueError(error)

all_filepaths = filepaths + filepaths_in_directories
# currently, the files are converted sequentially to gently handle API failures
documents = []
meta_list = normalize_metadata(meta, sources_count=len(all_filepaths))
Expand Down
19 changes: 17 additions & 2 deletions integrations/unstructured/tests/test_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,10 @@ def test_run_one_doc_per_element_with_meta(self, samples_path):
@pytest.mark.integration
def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path):
pdf_path = [samples_path / "sample_pdf.pdf", samples_path / "sample_pdf2.pdf"]
meta = [{"custom_meta": "foobar", "common_meta": "common"}, {"other_meta": "barfoo", "common_meta": "common"}]
meta = [
{"custom_meta": "sample_pdf.pdf", "common_meta": "common"},
{"custom_meta": "sample_pdf2.pdf", "common_meta": "common"},
]
local_converter = UnstructuredFileConverter(
api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-element"
)
Expand All @@ -163,6 +166,7 @@ def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path):

assert len(documents) > 4
for doc in documents:
assert doc.meta["custom_meta"] == doc.meta["filename"]
assert "file_path" in doc.meta
assert "page_number" in doc.meta
# elements have a category attribute that is saved in the document meta
Expand All @@ -171,9 +175,20 @@ def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path):
assert doc.meta["common_meta"] == "common"

@pytest.mark.integration
def test_run_one_doc_per_element_with_meta_list_folder(self, samples_path):
def test_run_one_doc_per_element_with_meta_list_folder_fail(self, samples_path):
pdf_path = [samples_path]
meta = [{"custom_meta": "foobar", "common_meta": "common"}, {"other_meta": "barfoo", "common_meta": "common"}]
local_converter = UnstructuredFileConverter(
api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-element"
)
with pytest.raises(ValueError):
local_converter.run(paths=pdf_path, meta=meta)["documents"]

@pytest.mark.integration
def test_run_one_doc_per_element_with_meta_list_folder(self, samples_path):
pdf_path = [samples_path]
meta = {"common_meta": "common"}

local_converter = UnstructuredFileConverter(
api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-element"
)
Expand Down