From dec120d273486bae6a5d867d352df887582a3cb5 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Thu, 12 Oct 2023 13:41:15 +0200 Subject: [PATCH 01/12] add DocumentLanguageClassifier and tests --- .../pipelines/test_preprocessing_pipeline.py | 71 +++++++++++++++ .../components/preprocessors/__init__.py | 3 +- .../document_language_classifier.py | 87 +++++++++++++++++++ .../test_document_language_classifier.py | 58 +++++++++++++ 4 files changed, 218 insertions(+), 1 deletion(-) create mode 100644 e2e/preview/pipelines/test_preprocessing_pipeline.py create mode 100644 haystack/preview/components/preprocessors/document_language_classifier.py create mode 100644 test/preview/components/preprocessors/test_document_language_classifier.py diff --git a/e2e/preview/pipelines/test_preprocessing_pipeline.py b/e2e/preview/pipelines/test_preprocessing_pipeline.py new file mode 100644 index 0000000000..b99ec89a56 --- /dev/null +++ b/e2e/preview/pipelines/test_preprocessing_pipeline.py @@ -0,0 +1,71 @@ +import json + +from haystack.preview import Pipeline +from haystack.preview.components.embedders import SentenceTransformersDocumentEmbedder +from haystack.preview.components.file_converters import TextFileToDocument +from haystack.preview.components.preprocessors import TextDocumentSplitter, TextDocumentCleaner +from haystack.preview.components.routers import FileTypeRouter, DocumentLanguageClassifier +from haystack.preview.components.writers import DocumentWriter +from haystack.preview.document_stores import MemoryDocumentStore + + +def test_preprocessing_pipeline(tmp_path): + # Create the pipeline and its components + document_store = MemoryDocumentStore() + preprocessing_pipeline = Pipeline() + preprocessing_pipeline.add_component(instance=FileTypeRouter(mime_types=["text/plain"]), name="file_type_router") + preprocessing_pipeline.add_component(instance=TextFileToDocument(), name="text_file_converter") + preprocessing_pipeline.add_component(instance=DocumentLanguageClassifier(), name="language_classifier") + preprocessing_pipeline.add_component(instance=TextDocumentCleaner(), name="cleaner") + preprocessing_pipeline.add_component( + instance=TextDocumentSplitter(split_by="sentence", split_length=1), name="splitter" + ) + preprocessing_pipeline.add_component( + instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"), + name="embedder", + ) + preprocessing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="writer") + preprocessing_pipeline.connect("file_type_router.text/plain", "text_file_converter.paths") + preprocessing_pipeline.connect("text_file_converter.documents", "language_classifier.documents") + preprocessing_pipeline.connect("language_classifier.documents", "cleaner.documents") + preprocessing_pipeline.connect("cleaner.documents", "splitter.documents") + preprocessing_pipeline.connect("splitter.documents", "embedder.documents") + preprocessing_pipeline.connect("embedder.documents", "writer.documents") + + # Draw the pipeline + preprocessing_pipeline.draw(tmp_path / "test_preprocessing_pipeline.png") + + # Serialize the pipeline to JSON + with open(tmp_path / "test_preprocessing_pipeline.json", "w") as f: + print(json.dumps(preprocessing_pipeline.to_dict(), indent=4)) + json.dump(preprocessing_pipeline.to_dict(), f) + + # Load the pipeline back + with open(tmp_path / "test_preprocessing_pipeline.json", "r") as f: + preprocessing_pipeline = Pipeline.from_dict(json.load(f)) + + # Write a txt file + with open(tmp_path / "test_file_english.txt", "w") as f: + f.write( + "This is an english sentence. There is more to it. It's a long text." + "Spans multiple lines." + "" + "Even contains empty lines. And extra whitespaces." + ) + + # Write a txt file + with open(tmp_path / "test_file_german.txt", "w") as f: + f.write("Ein deutscher Satz ohne Verb.") + + # Add two txt files and one non-txt file + paths = [ + tmp_path / "test_file_english.txt", + tmp_path / "test_file_german.txt", + tmp_path / "test_preprocessing_pipeline.json", + ] + + result = preprocessing_pipeline.run({"file_type_router": {"sources": paths}}) + + # TODO Add more assertions + assert result["writer"]["documents_written"] == 6 + assert document_store.count_documents() == 6 diff --git a/haystack/preview/components/preprocessors/__init__.py b/haystack/preview/components/preprocessors/__init__.py index 33a0e2cd18..b22547c14a 100644 --- a/haystack/preview/components/preprocessors/__init__.py +++ b/haystack/preview/components/preprocessors/__init__.py @@ -1,3 +1,4 @@ from haystack.preview.components.preprocessors.text_document_splitter import TextDocumentSplitter +from haystack.preview.components.preprocessors.document_language_classifier import DocumentLanguageClassifier -__all__ = ["TextDocumentSplitter"] +__all__ = ["TextDocumentSplitter", "DocumentLanguageClassifier"] diff --git a/haystack/preview/components/preprocessors/document_language_classifier.py b/haystack/preview/components/preprocessors/document_language_classifier.py new file mode 100644 index 0000000000..972b3f57c5 --- /dev/null +++ b/haystack/preview/components/preprocessors/document_language_classifier.py @@ -0,0 +1,87 @@ +import logging +from typing import List, Dict, Any, Optional + +from haystack.preview import component, default_from_dict, default_to_dict, Document +from haystack.preview.lazy_imports import LazyImport + +logger = logging.getLogger(__name__) + +with LazyImport("Run 'pip install langdetect'") as langdetect_import: + import langdetect + + +@component +class DocumentLanguageClassifier: + """ + Routes documents onto different output connections depending on their language. + This is useful for routing documents to different models in a pipeline depending on their language. + The set of supported languages can be specified. + For routing texts based on their language use the related TextLanguageClassifier component. + + Example usage in and indexing pipeline that writes only English language documents to a Store: + document_store = MemoryDocumentStore() + p = Pipeline() + p.add_component(instance=TextFileToDocument(), name="text_file_converter") + p.add_component(instance=DocumentLanguageClassifier(), name="language_classifier") + p.add_component(instance=TextDocumentSplitter(), name="splitter") + p.add_component(instance=DocumentWriter(document_store=document_store), name="writer") + p.connect("text_file_converter.documents", "language_classifier.documents") + p.connect("language_classifier.documents", "splitter.documents") + p.connect("splitter.documents", "writer.documents") + """ + + def __init__(self, languages: Optional[List[str]] = None): + """ + :param languages: A list of languages in ISO code, each corresponding to a different output connection (see [langdetect` documentation](https://github.com/Mimino666/langdetect#languages)). By default, only ["en"] is supported and Documents of any other language are routed to "unmatched". + """ + langdetect_import.check() + if not languages: + languages = ["en"] + self.languages = languages + component.set_output_types(self, unmatched=List[str], **{language: List[str] for language in languages}) + + def run(self, documents: List[Document]): + """ + Run the DocumentLanguageClassifier. This method routes the documents to different edges based on their language. + If a Document's text does not match any of the languages specified at initialization, it is routed to + a connection named "unmatched". + + :param documents: A list of documents to route to different edges. + """ + if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): + raise TypeError( + "DocumentLanguageClassifier expects a list of Document as input. In case you want to classify a text, please use the TextLanguageClassifier." + ) + + output: Dict[str, List[Document]] = {language: [] for language in self.languages} + output["unmatched"] = [] + + for document in documents: + detected_language = self.detect_language(document) + if detected_language in self.languages: + output[detected_language].append(document) + else: + output["unmatched"].append(document) + + return output + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. + """ + return default_to_dict(self, languages=self.languages) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "DocumentLanguageClassifier": + """ + Deserialize this component from a dictionary. + """ + return default_from_dict(cls, data) + + def detect_language(self, document: Document) -> Optional[str]: + try: + language = langdetect.detect(document.text) + except langdetect.LangDetectException: + logger.warning("Langdetect cannot detect the language of Document with id: %s", document.id) + language = None + return language diff --git a/test/preview/components/preprocessors/test_document_language_classifier.py b/test/preview/components/preprocessors/test_document_language_classifier.py new file mode 100644 index 0000000000..838d437c12 --- /dev/null +++ b/test/preview/components/preprocessors/test_document_language_classifier.py @@ -0,0 +1,58 @@ +import logging +import pytest + +from haystack.preview import Document +from haystack.preview.components.preprocessors import DocumentLanguageClassifier + + +class TestDocumentLanguageClassifier: + @pytest.mark.unit + def test_to_dict(self): + component = DocumentLanguageClassifier(languages=["en", "de"]) + data = component.to_dict() + assert data == {"type": "DocumentLanguageClassifier", "init_parameters": {"languages": ["en", "de"]}} + + @pytest.mark.unit + def test_from_dict(self): + data = {"type": "DocumentLanguageClassifier", "init_parameters": {"languages": ["en", "de"]}} + component = DocumentLanguageClassifier.from_dict(data) + assert component.languages == ["en", "de"] + + @pytest.mark.unit + def test_non_document_input(self): + with pytest.raises(TypeError, match="DocumentLanguageClassifier expects a list of Document as input."): + classifier = DocumentLanguageClassifier() + classifier.run(documents="This is an english sentence.") + + @pytest.mark.unit + def test_single_document(self): + with pytest.raises(TypeError, match="DocumentLanguageClassifier expects a list of Document as input."): + classifier = DocumentLanguageClassifier() + classifier.run(documents=Document(text="This is an english sentence.")) + + @pytest.mark.unit + def test_empty_list(self): + classifier = DocumentLanguageClassifier() + result = classifier.run(documents=[]) + assert result == {"en": [], "unmatched": []} + + @pytest.mark.unit + def test_detect_language(self): + classifier = DocumentLanguageClassifier() + detected_language = classifier.detect_language(Document(text="This is an english sentence.")) + assert detected_language == "en" + + @pytest.mark.unit + def test_route_to_en_and_unmatched(self): + classifier = DocumentLanguageClassifier() + english_document = Document(text="This is an english sentence.") + german_document = Document(text="Ein deutscher Satz ohne Verb.") + result = classifier.run(documents=[english_document, german_document]) + assert result == {"en": [english_document], "unmatched": [german_document]} + + @pytest.mark.unit + def test_warning_if_no_language_detected(self, caplog): + with caplog.at_level(logging.WARNING): + classifier = DocumentLanguageClassifier() + classifier.run(documents=[Document(text=".")]) + assert "Langdetect cannot detect the language of Document with id" in caplog.text From 6b0408e93953139bc59704f39e4a817be4c9739c Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Thu, 12 Oct 2023 13:46:30 +0200 Subject: [PATCH 02/12] reno --- .../notes/document-language-classifier-1ec0b3c4d08989c0.yaml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 releasenotes/notes/document-language-classifier-1ec0b3c4d08989c0.yaml diff --git a/releasenotes/notes/document-language-classifier-1ec0b3c4d08989c0.yaml b/releasenotes/notes/document-language-classifier-1ec0b3c4d08989c0.yaml new file mode 100644 index 0000000000..07372290f3 --- /dev/null +++ b/releasenotes/notes/document-language-classifier-1ec0b3c4d08989c0.yaml @@ -0,0 +1,4 @@ +--- +preview: + - | + Added DocumentLanguageClassifier component so that Documents can be routed to different components based on the detected language for example during preprocessing. From 2837d69c8af02e09f7c9e036c338b0c5b6dda9cd Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 13 Oct 2023 09:14:01 +0200 Subject: [PATCH 03/12] fix import, rename DocumentCleaner --- e2e/preview/pipelines/test_preprocessing_pipeline.py | 6 +++--- .../test_document_language_classifier.py | 11 +++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/e2e/preview/pipelines/test_preprocessing_pipeline.py b/e2e/preview/pipelines/test_preprocessing_pipeline.py index b99ec89a56..9dac07b8d1 100644 --- a/e2e/preview/pipelines/test_preprocessing_pipeline.py +++ b/e2e/preview/pipelines/test_preprocessing_pipeline.py @@ -3,8 +3,8 @@ from haystack.preview import Pipeline from haystack.preview.components.embedders import SentenceTransformersDocumentEmbedder from haystack.preview.components.file_converters import TextFileToDocument -from haystack.preview.components.preprocessors import TextDocumentSplitter, TextDocumentCleaner -from haystack.preview.components.routers import FileTypeRouter, DocumentLanguageClassifier +from haystack.preview.components.preprocessors import TextDocumentSplitter, DocumentCleaner, DocumentLanguageClassifier +from haystack.preview.components.routers import FileTypeRouter from haystack.preview.components.writers import DocumentWriter from haystack.preview.document_stores import MemoryDocumentStore @@ -16,7 +16,7 @@ def test_preprocessing_pipeline(tmp_path): preprocessing_pipeline.add_component(instance=FileTypeRouter(mime_types=["text/plain"]), name="file_type_router") preprocessing_pipeline.add_component(instance=TextFileToDocument(), name="text_file_converter") preprocessing_pipeline.add_component(instance=DocumentLanguageClassifier(), name="language_classifier") - preprocessing_pipeline.add_component(instance=TextDocumentCleaner(), name="cleaner") + preprocessing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner") preprocessing_pipeline.add_component( instance=TextDocumentSplitter(split_by="sentence", split_length=1), name="splitter" ) diff --git a/test/preview/components/preprocessors/test_document_language_classifier.py b/test/preview/components/preprocessors/test_document_language_classifier.py index 838d437c12..a7ab826ac8 100644 --- a/test/preview/components/preprocessors/test_document_language_classifier.py +++ b/test/preview/components/preprocessors/test_document_language_classifier.py @@ -6,8 +6,19 @@ class TestDocumentLanguageClassifier: + @pytest.mark.unit + def test_init(self): + component = DocumentLanguageClassifier() + assert component.languages == ["en"] + @pytest.mark.unit def test_to_dict(self): + component = DocumentLanguageClassifier() + data = component.to_dict() + assert data == {"type": "DocumentLanguageClassifier", "init_parameters": {"languages": ["en"]}} + + @pytest.mark.unit + def test_to_dict_with_custom_init_parameters(self): component = DocumentLanguageClassifier(languages=["en", "de"]) data = component.to_dict() assert data == {"type": "DocumentLanguageClassifier", "init_parameters": {"languages": ["en", "de"]}} From 0ad656be9bc3f829740fa17105c7571d91b17c91 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 13 Oct 2023 10:05:07 +0200 Subject: [PATCH 04/12] mark example usage as python code --- .../components/preprocessors/document_language_classifier.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/haystack/preview/components/preprocessors/document_language_classifier.py b/haystack/preview/components/preprocessors/document_language_classifier.py index 972b3f57c5..f0cfc40189 100644 --- a/haystack/preview/components/preprocessors/document_language_classifier.py +++ b/haystack/preview/components/preprocessors/document_language_classifier.py @@ -19,6 +19,8 @@ class DocumentLanguageClassifier: For routing texts based on their language use the related TextLanguageClassifier component. Example usage in and indexing pipeline that writes only English language documents to a Store: + + ```python document_store = MemoryDocumentStore() p = Pipeline() p.add_component(instance=TextFileToDocument(), name="text_file_converter") @@ -28,6 +30,7 @@ class DocumentLanguageClassifier: p.connect("text_file_converter.documents", "language_classifier.documents") p.connect("language_classifier.documents", "splitter.documents") p.connect("splitter.documents", "writer.documents") + ``` """ def __init__(self, languages: Optional[List[str]] = None): From 6edebc03dbb5769784c48e571957547d7495b7be Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 13 Oct 2023 14:38:05 +0200 Subject: [PATCH 05/12] add assertions to e2e test --- .../pipelines/test_preprocessing_pipeline.py | 16 ++++++++++++++-- .../document_language_classifier.py | 4 +++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/e2e/preview/pipelines/test_preprocessing_pipeline.py b/e2e/preview/pipelines/test_preprocessing_pipeline.py index 9dac07b8d1..74f59724c8 100644 --- a/e2e/preview/pipelines/test_preprocessing_pipeline.py +++ b/e2e/preview/pipelines/test_preprocessing_pipeline.py @@ -27,7 +27,7 @@ def test_preprocessing_pipeline(tmp_path): preprocessing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="writer") preprocessing_pipeline.connect("file_type_router.text/plain", "text_file_converter.paths") preprocessing_pipeline.connect("text_file_converter.documents", "language_classifier.documents") - preprocessing_pipeline.connect("language_classifier.documents", "cleaner.documents") + preprocessing_pipeline.connect("language_classifier.en", "cleaner.documents") preprocessing_pipeline.connect("cleaner.documents", "splitter.documents") preprocessing_pipeline.connect("splitter.documents", "embedder.documents") preprocessing_pipeline.connect("embedder.documents", "writer.documents") @@ -66,6 +66,18 @@ def test_preprocessing_pipeline(tmp_path): result = preprocessing_pipeline.run({"file_type_router": {"sources": paths}}) - # TODO Add more assertions assert result["writer"]["documents_written"] == 6 assert document_store.count_documents() == 6 + + # Check preprocessed texts and mime_types + stored_documents = document_store.filter_documents() + expected_texts = [ + "This is an english sentence.", + " There is more to it.", + " It's a long text.", + "Spans multiple lines.", + "Even contains empty lines.", + " And extra whitespaces.", + ] + assert expected_texts == [document.text for document in stored_documents] + assert all(document.mime_type == "text/plain" for document in stored_documents) diff --git a/haystack/preview/components/preprocessors/document_language_classifier.py b/haystack/preview/components/preprocessors/document_language_classifier.py index f0cfc40189..e7edde1023 100644 --- a/haystack/preview/components/preprocessors/document_language_classifier.py +++ b/haystack/preview/components/preprocessors/document_language_classifier.py @@ -41,7 +41,9 @@ def __init__(self, languages: Optional[List[str]] = None): if not languages: languages = ["en"] self.languages = languages - component.set_output_types(self, unmatched=List[str], **{language: List[str] for language in languages}) + component.set_output_types( + self, unmatched=List[Document], **{language: List[Document] for language in languages} + ) def run(self, documents: List[Document]): """ From 621823ba608990dcb15c1af891b35867c50caed6 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Tue, 17 Oct 2023 09:23:40 +0200 Subject: [PATCH 06/12] use deserialized document_store --- e2e/preview/pipelines/test_preprocessing_pipeline.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/e2e/preview/pipelines/test_preprocessing_pipeline.py b/e2e/preview/pipelines/test_preprocessing_pipeline.py index 74f59724c8..dc21d1b843 100644 --- a/e2e/preview/pipelines/test_preprocessing_pipeline.py +++ b/e2e/preview/pipelines/test_preprocessing_pipeline.py @@ -67,10 +67,11 @@ def test_preprocessing_pipeline(tmp_path): result = preprocessing_pipeline.run({"file_type_router": {"sources": paths}}) assert result["writer"]["documents_written"] == 6 - assert document_store.count_documents() == 6 + filled_document_store = preprocessing_pipeline.get_component("writer").document_store + assert filled_document_store.count_documents() == 6 # Check preprocessed texts and mime_types - stored_documents = document_store.filter_documents() + stored_documents = filled_document_store.filter_documents() expected_texts = [ "This is an english sentence.", " There is more to it.", From 470d0268fb3abed43fc0135264258f9d282a46c2 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Wed, 18 Oct 2023 13:06:54 +0200 Subject: [PATCH 07/12] Apply suggestions from code review Co-authored-by: Massimiliano Pippi --- .../components/preprocessors/document_language_classifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/preview/components/preprocessors/document_language_classifier.py b/haystack/preview/components/preprocessors/document_language_classifier.py index e7edde1023..297216ea07 100644 --- a/haystack/preview/components/preprocessors/document_language_classifier.py +++ b/haystack/preview/components/preprocessors/document_language_classifier.py @@ -16,9 +16,9 @@ class DocumentLanguageClassifier: Routes documents onto different output connections depending on their language. This is useful for routing documents to different models in a pipeline depending on their language. The set of supported languages can be specified. - For routing texts based on their language use the related TextLanguageClassifier component. + For routing plain text using the same logic, use the related TextLanguageClassifier component instead. - Example usage in and indexing pipeline that writes only English language documents to a Store: + Example usage within an indexing pipeline, storing only documents written in English language documents to a Store: ```python document_store = MemoryDocumentStore() From 32654b14e4a4cb652fe5d3408d022f8bd2bea5d8 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Wed, 18 Oct 2023 13:10:48 +0200 Subject: [PATCH 08/12] remove from/to_dict --- .../document_language_classifier.py | 21 +++---------------- .../test_document_language_classifier.py | 18 ---------------- 2 files changed, 3 insertions(+), 36 deletions(-) diff --git a/haystack/preview/components/preprocessors/document_language_classifier.py b/haystack/preview/components/preprocessors/document_language_classifier.py index 297216ea07..6d3e55038d 100644 --- a/haystack/preview/components/preprocessors/document_language_classifier.py +++ b/haystack/preview/components/preprocessors/document_language_classifier.py @@ -1,7 +1,7 @@ import logging -from typing import List, Dict, Any, Optional +from typing import List, Dict, Optional -from haystack.preview import component, default_from_dict, default_to_dict, Document +from haystack.preview import component, Document from haystack.preview.lazy_imports import LazyImport logger = logging.getLogger(__name__) @@ -25,11 +25,9 @@ class DocumentLanguageClassifier: p = Pipeline() p.add_component(instance=TextFileToDocument(), name="text_file_converter") p.add_component(instance=DocumentLanguageClassifier(), name="language_classifier") - p.add_component(instance=TextDocumentSplitter(), name="splitter") p.add_component(instance=DocumentWriter(document_store=document_store), name="writer") p.connect("text_file_converter.documents", "language_classifier.documents") - p.connect("language_classifier.documents", "splitter.documents") - p.connect("splitter.documents", "writer.documents") + p.connect("language_classifier.en", "writer.documents") ``` """ @@ -70,19 +68,6 @@ def run(self, documents: List[Document]): return output - def to_dict(self) -> Dict[str, Any]: - """ - Serialize this component to a dictionary. - """ - return default_to_dict(self, languages=self.languages) - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "DocumentLanguageClassifier": - """ - Deserialize this component from a dictionary. - """ - return default_from_dict(cls, data) - def detect_language(self, document: Document) -> Optional[str]: try: language = langdetect.detect(document.text) diff --git a/test/preview/components/preprocessors/test_document_language_classifier.py b/test/preview/components/preprocessors/test_document_language_classifier.py index a7ab826ac8..88e0ba88c7 100644 --- a/test/preview/components/preprocessors/test_document_language_classifier.py +++ b/test/preview/components/preprocessors/test_document_language_classifier.py @@ -11,24 +11,6 @@ def test_init(self): component = DocumentLanguageClassifier() assert component.languages == ["en"] - @pytest.mark.unit - def test_to_dict(self): - component = DocumentLanguageClassifier() - data = component.to_dict() - assert data == {"type": "DocumentLanguageClassifier", "init_parameters": {"languages": ["en"]}} - - @pytest.mark.unit - def test_to_dict_with_custom_init_parameters(self): - component = DocumentLanguageClassifier(languages=["en", "de"]) - data = component.to_dict() - assert data == {"type": "DocumentLanguageClassifier", "init_parameters": {"languages": ["en", "de"]}} - - @pytest.mark.unit - def test_from_dict(self): - data = {"type": "DocumentLanguageClassifier", "init_parameters": {"languages": ["en", "de"]}} - component = DocumentLanguageClassifier.from_dict(data) - assert component.languages == ["en", "de"] - @pytest.mark.unit def test_non_document_input(self): with pytest.raises(TypeError, match="DocumentLanguageClassifier expects a list of Document as input."): From d9e1fd9b4d4e8ed7d385772e157deb471159e8e9 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Wed, 18 Oct 2023 13:39:09 +0200 Subject: [PATCH 09/12] use renamed InMemoryDocumentStore --- e2e/preview/pipelines/test_preprocessing_pipeline.py | 4 ++-- .../components/preprocessors/document_language_classifier.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/e2e/preview/pipelines/test_preprocessing_pipeline.py b/e2e/preview/pipelines/test_preprocessing_pipeline.py index dc21d1b843..ec6d5356e6 100644 --- a/e2e/preview/pipelines/test_preprocessing_pipeline.py +++ b/e2e/preview/pipelines/test_preprocessing_pipeline.py @@ -6,12 +6,12 @@ from haystack.preview.components.preprocessors import TextDocumentSplitter, DocumentCleaner, DocumentLanguageClassifier from haystack.preview.components.routers import FileTypeRouter from haystack.preview.components.writers import DocumentWriter -from haystack.preview.document_stores import MemoryDocumentStore +from haystack.preview.document_stores import InMemoryDocumentStore def test_preprocessing_pipeline(tmp_path): # Create the pipeline and its components - document_store = MemoryDocumentStore() + document_store = InMemoryDocumentStore() preprocessing_pipeline = Pipeline() preprocessing_pipeline.add_component(instance=FileTypeRouter(mime_types=["text/plain"]), name="file_type_router") preprocessing_pipeline.add_component(instance=TextFileToDocument(), name="text_file_converter") diff --git a/haystack/preview/components/preprocessors/document_language_classifier.py b/haystack/preview/components/preprocessors/document_language_classifier.py index 6d3e55038d..72b4ed11c6 100644 --- a/haystack/preview/components/preprocessors/document_language_classifier.py +++ b/haystack/preview/components/preprocessors/document_language_classifier.py @@ -21,7 +21,7 @@ class DocumentLanguageClassifier: Example usage within an indexing pipeline, storing only documents written in English language documents to a Store: ```python - document_store = MemoryDocumentStore() + document_store = InMemoryDocumentStore() p = Pipeline() p.add_component(instance=TextFileToDocument(), name="text_file_converter") p.add_component(instance=DocumentLanguageClassifier(), name="language_classifier") From 322d74a8d924c8a0b1e736c331fda1ae4babed70 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Tue, 31 Oct 2023 14:55:05 +0100 Subject: [PATCH 10/12] adapt to Document refactoring --- e2e/preview/pipelines/test_preprocessing_pipeline.py | 2 +- .../preprocessors/document_language_classifier.py | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/e2e/preview/pipelines/test_preprocessing_pipeline.py b/e2e/preview/pipelines/test_preprocessing_pipeline.py index ec6d5356e6..5b32c642a4 100644 --- a/e2e/preview/pipelines/test_preprocessing_pipeline.py +++ b/e2e/preview/pipelines/test_preprocessing_pipeline.py @@ -80,5 +80,5 @@ def test_preprocessing_pipeline(tmp_path): "Even contains empty lines.", " And extra whitespaces.", ] - assert expected_texts == [document.text for document in stored_documents] + assert expected_texts == [document.content for document in stored_documents] assert all(document.mime_type == "text/plain" for document in stored_documents) diff --git a/haystack/preview/components/preprocessors/document_language_classifier.py b/haystack/preview/components/preprocessors/document_language_classifier.py index 72b4ed11c6..b3707e2295 100644 --- a/haystack/preview/components/preprocessors/document_language_classifier.py +++ b/haystack/preview/components/preprocessors/document_language_classifier.py @@ -33,7 +33,9 @@ class DocumentLanguageClassifier: def __init__(self, languages: Optional[List[str]] = None): """ - :param languages: A list of languages in ISO code, each corresponding to a different output connection (see [langdetect` documentation](https://github.com/Mimino666/langdetect#languages)). By default, only ["en"] is supported and Documents of any other language are routed to "unmatched". + :param languages: A list of languages in ISO code, each corresponding to a different output connection + (see [langdetect` documentation](https://github.com/Mimino666/langdetect#languages)). + By default, only ["en"] is supported and Documents of any other language are routed to "unmatched". """ langdetect_import.check() if not languages: @@ -53,7 +55,8 @@ def run(self, documents: List[Document]): """ if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): raise TypeError( - "DocumentLanguageClassifier expects a list of Document as input. In case you want to classify a text, please use the TextLanguageClassifier." + "DocumentLanguageClassifier expects a list of Document as input. " + "In case you want to classify a text, please use the TextLanguageClassifier." ) output: Dict[str, List[Document]] = {language: [] for language in self.languages} @@ -70,7 +73,7 @@ def run(self, documents: List[Document]): def detect_language(self, document: Document) -> Optional[str]: try: - language = langdetect.detect(document.text) + language = langdetect.detect(document.content) except langdetect.LangDetectException: logger.warning("Langdetect cannot detect the language of Document with id: %s", document.id) language = None From 8e33dbc966528076c8315a23f0bff4bdb812fa59 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Tue, 31 Oct 2023 15:00:52 +0100 Subject: [PATCH 11/12] improve docstring --- .../components/preprocessors/document_language_classifier.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/haystack/preview/components/preprocessors/document_language_classifier.py b/haystack/preview/components/preprocessors/document_language_classifier.py index b3707e2295..98198cca12 100644 --- a/haystack/preview/components/preprocessors/document_language_classifier.py +++ b/haystack/preview/components/preprocessors/document_language_classifier.py @@ -18,7 +18,8 @@ class DocumentLanguageClassifier: The set of supported languages can be specified. For routing plain text using the same logic, use the related TextLanguageClassifier component instead. - Example usage within an indexing pipeline, storing only documents written in English language documents to a Store: + Example usage within an indexing pipeline, storing in a Document Store + only documents written in English: ```python document_store = InMemoryDocumentStore() From 0cb60028b8191ae6c0754c580beaa9bed2ac95fd Mon Sep 17 00:00:00 2001 From: anakin87 Date: Tue, 31 Oct 2023 15:14:26 +0100 Subject: [PATCH 12/12] fix test for new Document --- .../preprocessors/test_document_language_classifier.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/preview/components/preprocessors/test_document_language_classifier.py b/test/preview/components/preprocessors/test_document_language_classifier.py index 88e0ba88c7..e666649cec 100644 --- a/test/preview/components/preprocessors/test_document_language_classifier.py +++ b/test/preview/components/preprocessors/test_document_language_classifier.py @@ -21,7 +21,7 @@ def test_non_document_input(self): def test_single_document(self): with pytest.raises(TypeError, match="DocumentLanguageClassifier expects a list of Document as input."): classifier = DocumentLanguageClassifier() - classifier.run(documents=Document(text="This is an english sentence.")) + classifier.run(documents=Document(content="This is an english sentence.")) @pytest.mark.unit def test_empty_list(self): @@ -32,14 +32,14 @@ def test_empty_list(self): @pytest.mark.unit def test_detect_language(self): classifier = DocumentLanguageClassifier() - detected_language = classifier.detect_language(Document(text="This is an english sentence.")) + detected_language = classifier.detect_language(Document(content="This is an english sentence.")) assert detected_language == "en" @pytest.mark.unit def test_route_to_en_and_unmatched(self): classifier = DocumentLanguageClassifier() - english_document = Document(text="This is an english sentence.") - german_document = Document(text="Ein deutscher Satz ohne Verb.") + english_document = Document(content="This is an english sentence.") + german_document = Document(content="Ein deutscher Satz ohne Verb.") result = classifier.run(documents=[english_document, german_document]) assert result == {"en": [english_document], "unmatched": [german_document]} @@ -47,5 +47,5 @@ def test_route_to_en_and_unmatched(self): def test_warning_if_no_language_detected(self, caplog): with caplog.at_level(logging.WARNING): classifier = DocumentLanguageClassifier() - classifier.run(documents=[Document(text=".")]) + classifier.run(documents=[Document(content=".")]) assert "Langdetect cannot detect the language of Document with id" in caplog.text