From dec120d273486bae6a5d867d352df887582a3cb5 Mon Sep 17 00:00:00 2001
From: Julian Risch <julian.risch@deepset.ai>
Date: Thu, 12 Oct 2023 13:41:15 +0200
Subject: [PATCH 01/12] add DocumentLanguageClassifier and tests

---
 .../pipelines/test_preprocessing_pipeline.py  | 71 +++++++++++++++
 .../components/preprocessors/__init__.py      |  3 +-
 .../document_language_classifier.py           | 87 +++++++++++++++++++
 .../test_document_language_classifier.py      | 58 +++++++++++++
 4 files changed, 218 insertions(+), 1 deletion(-)
 create mode 100644 e2e/preview/pipelines/test_preprocessing_pipeline.py
 create mode 100644 haystack/preview/components/preprocessors/document_language_classifier.py
 create mode 100644 test/preview/components/preprocessors/test_document_language_classifier.py

diff --git a/e2e/preview/pipelines/test_preprocessing_pipeline.py b/e2e/preview/pipelines/test_preprocessing_pipeline.py
new file mode 100644
index 0000000000..b99ec89a56
--- /dev/null
+++ b/e2e/preview/pipelines/test_preprocessing_pipeline.py
@@ -0,0 +1,71 @@
+import json
+
+from haystack.preview import Pipeline
+from haystack.preview.components.embedders import SentenceTransformersDocumentEmbedder
+from haystack.preview.components.file_converters import TextFileToDocument
+from haystack.preview.components.preprocessors import TextDocumentSplitter, TextDocumentCleaner
+from haystack.preview.components.routers import FileTypeRouter, DocumentLanguageClassifier
+from haystack.preview.components.writers import DocumentWriter
+from haystack.preview.document_stores import MemoryDocumentStore
+
+
+def test_preprocessing_pipeline(tmp_path):
+    # Create the pipeline and its components
+    document_store = MemoryDocumentStore()
+    preprocessing_pipeline = Pipeline()
+    preprocessing_pipeline.add_component(instance=FileTypeRouter(mime_types=["text/plain"]), name="file_type_router")
+    preprocessing_pipeline.add_component(instance=TextFileToDocument(), name="text_file_converter")
+    preprocessing_pipeline.add_component(instance=DocumentLanguageClassifier(), name="language_classifier")
+    preprocessing_pipeline.add_component(instance=TextDocumentCleaner(), name="cleaner")
+    preprocessing_pipeline.add_component(
+        instance=TextDocumentSplitter(split_by="sentence", split_length=1), name="splitter"
+    )
+    preprocessing_pipeline.add_component(
+        instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
+        name="embedder",
+    )
+    preprocessing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
+    preprocessing_pipeline.connect("file_type_router.text/plain", "text_file_converter.paths")
+    preprocessing_pipeline.connect("text_file_converter.documents", "language_classifier.documents")
+    preprocessing_pipeline.connect("language_classifier.documents", "cleaner.documents")
+    preprocessing_pipeline.connect("cleaner.documents", "splitter.documents")
+    preprocessing_pipeline.connect("splitter.documents", "embedder.documents")
+    preprocessing_pipeline.connect("embedder.documents", "writer.documents")
+
+    # Draw the pipeline
+    preprocessing_pipeline.draw(tmp_path / "test_preprocessing_pipeline.png")
+
+    # Serialize the pipeline to JSON
+    with open(tmp_path / "test_preprocessing_pipeline.json", "w") as f:
+        print(json.dumps(preprocessing_pipeline.to_dict(), indent=4))
+        json.dump(preprocessing_pipeline.to_dict(), f)
+
+    # Load the pipeline back
+    with open(tmp_path / "test_preprocessing_pipeline.json", "r") as f:
+        preprocessing_pipeline = Pipeline.from_dict(json.load(f))
+
+    # Write a txt file
+    with open(tmp_path / "test_file_english.txt", "w") as f:
+        f.write(
+            "This is an english sentence. There is more to it. It's a long text."
+            "Spans multiple lines."
+            ""
+            "Even contains empty lines.  And extra whitespaces."
+        )
+
+    # Write a txt file
+    with open(tmp_path / "test_file_german.txt", "w") as f:
+        f.write("Ein deutscher Satz ohne Verb.")
+
+    # Add two txt files and one non-txt file
+    paths = [
+        tmp_path / "test_file_english.txt",
+        tmp_path / "test_file_german.txt",
+        tmp_path / "test_preprocessing_pipeline.json",
+    ]
+
+    result = preprocessing_pipeline.run({"file_type_router": {"sources": paths}})
+
+    # TODO Add more assertions
+    assert result["writer"]["documents_written"] == 6
+    assert document_store.count_documents() == 6
diff --git a/haystack/preview/components/preprocessors/__init__.py b/haystack/preview/components/preprocessors/__init__.py
index 33a0e2cd18..b22547c14a 100644
--- a/haystack/preview/components/preprocessors/__init__.py
+++ b/haystack/preview/components/preprocessors/__init__.py
@@ -1,3 +1,4 @@
 from haystack.preview.components.preprocessors.text_document_splitter import TextDocumentSplitter
+from haystack.preview.components.preprocessors.document_language_classifier import DocumentLanguageClassifier
 
-__all__ = ["TextDocumentSplitter"]
+__all__ = ["TextDocumentSplitter", "DocumentLanguageClassifier"]
diff --git a/haystack/preview/components/preprocessors/document_language_classifier.py b/haystack/preview/components/preprocessors/document_language_classifier.py
new file mode 100644
index 0000000000..972b3f57c5
--- /dev/null
+++ b/haystack/preview/components/preprocessors/document_language_classifier.py
@@ -0,0 +1,87 @@
+import logging
+from typing import List, Dict, Any, Optional
+
+from haystack.preview import component, default_from_dict, default_to_dict, Document
+from haystack.preview.lazy_imports import LazyImport
+
+logger = logging.getLogger(__name__)
+
+with LazyImport("Run 'pip install langdetect'") as langdetect_import:
+    import langdetect
+
+
+@component
+class DocumentLanguageClassifier:
+    """
+    Routes documents onto different output connections depending on their language.
+    This is useful for routing documents to different models in a pipeline depending on their language.
+    The set of supported languages can be specified.
+    For routing texts based on their language use the related TextLanguageClassifier component.
+
+    Example usage in and indexing pipeline that writes only English language documents to a Store:
+    document_store = MemoryDocumentStore()
+    p = Pipeline()
+    p.add_component(instance=TextFileToDocument(), name="text_file_converter")
+    p.add_component(instance=DocumentLanguageClassifier(), name="language_classifier")
+    p.add_component(instance=TextDocumentSplitter(), name="splitter")
+    p.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
+    p.connect("text_file_converter.documents", "language_classifier.documents")
+    p.connect("language_classifier.documents", "splitter.documents")
+    p.connect("splitter.documents", "writer.documents")
+    """
+
+    def __init__(self, languages: Optional[List[str]] = None):
+        """
+        :param languages: A list of languages in ISO code, each corresponding to a different output connection (see [langdetect` documentation](https://github.com/Mimino666/langdetect#languages)). By default, only ["en"] is supported and Documents of any other language are routed to "unmatched".
+        """
+        langdetect_import.check()
+        if not languages:
+            languages = ["en"]
+        self.languages = languages
+        component.set_output_types(self, unmatched=List[str], **{language: List[str] for language in languages})
+
+    def run(self, documents: List[Document]):
+        """
+        Run the DocumentLanguageClassifier. This method routes the documents to different edges based on their language.
+        If a Document's text does not match any of the languages specified at initialization, it is routed to
+        a connection named "unmatched".
+
+        :param documents: A list of documents to route to different edges.
+        """
+        if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
+            raise TypeError(
+                "DocumentLanguageClassifier expects a list of Document as input. In case you want to classify a text, please use the TextLanguageClassifier."
+            )
+
+        output: Dict[str, List[Document]] = {language: [] for language in self.languages}
+        output["unmatched"] = []
+
+        for document in documents:
+            detected_language = self.detect_language(document)
+            if detected_language in self.languages:
+                output[detected_language].append(document)
+            else:
+                output["unmatched"].append(document)
+
+        return output
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+        """
+        return default_to_dict(self, languages=self.languages)
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "DocumentLanguageClassifier":
+        """
+        Deserialize this component from a dictionary.
+        """
+        return default_from_dict(cls, data)
+
+    def detect_language(self, document: Document) -> Optional[str]:
+        try:
+            language = langdetect.detect(document.text)
+        except langdetect.LangDetectException:
+            logger.warning("Langdetect cannot detect the language of Document with id: %s", document.id)
+            language = None
+        return language
diff --git a/test/preview/components/preprocessors/test_document_language_classifier.py b/test/preview/components/preprocessors/test_document_language_classifier.py
new file mode 100644
index 0000000000..838d437c12
--- /dev/null
+++ b/test/preview/components/preprocessors/test_document_language_classifier.py
@@ -0,0 +1,58 @@
+import logging
+import pytest
+
+from haystack.preview import Document
+from haystack.preview.components.preprocessors import DocumentLanguageClassifier
+
+
+class TestDocumentLanguageClassifier:
+    @pytest.mark.unit
+    def test_to_dict(self):
+        component = DocumentLanguageClassifier(languages=["en", "de"])
+        data = component.to_dict()
+        assert data == {"type": "DocumentLanguageClassifier", "init_parameters": {"languages": ["en", "de"]}}
+
+    @pytest.mark.unit
+    def test_from_dict(self):
+        data = {"type": "DocumentLanguageClassifier", "init_parameters": {"languages": ["en", "de"]}}
+        component = DocumentLanguageClassifier.from_dict(data)
+        assert component.languages == ["en", "de"]
+
+    @pytest.mark.unit
+    def test_non_document_input(self):
+        with pytest.raises(TypeError, match="DocumentLanguageClassifier expects a list of Document as input."):
+            classifier = DocumentLanguageClassifier()
+            classifier.run(documents="This is an english sentence.")
+
+    @pytest.mark.unit
+    def test_single_document(self):
+        with pytest.raises(TypeError, match="DocumentLanguageClassifier expects a list of Document as input."):
+            classifier = DocumentLanguageClassifier()
+            classifier.run(documents=Document(text="This is an english sentence."))
+
+    @pytest.mark.unit
+    def test_empty_list(self):
+        classifier = DocumentLanguageClassifier()
+        result = classifier.run(documents=[])
+        assert result == {"en": [], "unmatched": []}
+
+    @pytest.mark.unit
+    def test_detect_language(self):
+        classifier = DocumentLanguageClassifier()
+        detected_language = classifier.detect_language(Document(text="This is an english sentence."))
+        assert detected_language == "en"
+
+    @pytest.mark.unit
+    def test_route_to_en_and_unmatched(self):
+        classifier = DocumentLanguageClassifier()
+        english_document = Document(text="This is an english sentence.")
+        german_document = Document(text="Ein deutscher Satz ohne Verb.")
+        result = classifier.run(documents=[english_document, german_document])
+        assert result == {"en": [english_document], "unmatched": [german_document]}
+
+    @pytest.mark.unit
+    def test_warning_if_no_language_detected(self, caplog):
+        with caplog.at_level(logging.WARNING):
+            classifier = DocumentLanguageClassifier()
+            classifier.run(documents=[Document(text=".")])
+            assert "Langdetect cannot detect the language of Document with id" in caplog.text

From 6b0408e93953139bc59704f39e4a817be4c9739c Mon Sep 17 00:00:00 2001
From: Julian Risch <julian.risch@deepset.ai>
Date: Thu, 12 Oct 2023 13:46:30 +0200
Subject: [PATCH 02/12] reno

---
 .../notes/document-language-classifier-1ec0b3c4d08989c0.yaml  | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 releasenotes/notes/document-language-classifier-1ec0b3c4d08989c0.yaml

diff --git a/releasenotes/notes/document-language-classifier-1ec0b3c4d08989c0.yaml b/releasenotes/notes/document-language-classifier-1ec0b3c4d08989c0.yaml
new file mode 100644
index 0000000000..07372290f3
--- /dev/null
+++ b/releasenotes/notes/document-language-classifier-1ec0b3c4d08989c0.yaml
@@ -0,0 +1,4 @@
+---
+preview:
+  - |
+    Added the DocumentLanguageClassifier component so that Documents can be routed to different components based on their detected language, for example during preprocessing.

From 2837d69c8af02e09f7c9e036c338b0c5b6dda9cd Mon Sep 17 00:00:00 2001
From: Julian Risch <julian.risch@deepset.ai>
Date: Fri, 13 Oct 2023 09:14:01 +0200
Subject: [PATCH 03/12] fix import, rename DocumentCleaner

---
 e2e/preview/pipelines/test_preprocessing_pipeline.py  |  6 +++---
 .../test_document_language_classifier.py              | 11 +++++++++++
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/e2e/preview/pipelines/test_preprocessing_pipeline.py b/e2e/preview/pipelines/test_preprocessing_pipeline.py
index b99ec89a56..9dac07b8d1 100644
--- a/e2e/preview/pipelines/test_preprocessing_pipeline.py
+++ b/e2e/preview/pipelines/test_preprocessing_pipeline.py
@@ -3,8 +3,8 @@
 from haystack.preview import Pipeline
 from haystack.preview.components.embedders import SentenceTransformersDocumentEmbedder
 from haystack.preview.components.file_converters import TextFileToDocument
-from haystack.preview.components.preprocessors import TextDocumentSplitter, TextDocumentCleaner
-from haystack.preview.components.routers import FileTypeRouter, DocumentLanguageClassifier
+from haystack.preview.components.preprocessors import TextDocumentSplitter, DocumentCleaner, DocumentLanguageClassifier
+from haystack.preview.components.routers import FileTypeRouter
 from haystack.preview.components.writers import DocumentWriter
 from haystack.preview.document_stores import MemoryDocumentStore
 
@@ -16,7 +16,7 @@ def test_preprocessing_pipeline(tmp_path):
     preprocessing_pipeline.add_component(instance=FileTypeRouter(mime_types=["text/plain"]), name="file_type_router")
     preprocessing_pipeline.add_component(instance=TextFileToDocument(), name="text_file_converter")
     preprocessing_pipeline.add_component(instance=DocumentLanguageClassifier(), name="language_classifier")
-    preprocessing_pipeline.add_component(instance=TextDocumentCleaner(), name="cleaner")
+    preprocessing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
     preprocessing_pipeline.add_component(
         instance=TextDocumentSplitter(split_by="sentence", split_length=1), name="splitter"
     )
diff --git a/test/preview/components/preprocessors/test_document_language_classifier.py b/test/preview/components/preprocessors/test_document_language_classifier.py
index 838d437c12..a7ab826ac8 100644
--- a/test/preview/components/preprocessors/test_document_language_classifier.py
+++ b/test/preview/components/preprocessors/test_document_language_classifier.py
@@ -6,8 +6,19 @@
 
 
 class TestDocumentLanguageClassifier:
+    @pytest.mark.unit
+    def test_init(self):
+        component = DocumentLanguageClassifier()
+        assert component.languages == ["en"]
+
     @pytest.mark.unit
     def test_to_dict(self):
+        component = DocumentLanguageClassifier()
+        data = component.to_dict()
+        assert data == {"type": "DocumentLanguageClassifier", "init_parameters": {"languages": ["en"]}}
+
+    @pytest.mark.unit
+    def test_to_dict_with_custom_init_parameters(self):
         component = DocumentLanguageClassifier(languages=["en", "de"])
         data = component.to_dict()
         assert data == {"type": "DocumentLanguageClassifier", "init_parameters": {"languages": ["en", "de"]}}

From 0ad656be9bc3f829740fa17105c7571d91b17c91 Mon Sep 17 00:00:00 2001
From: Julian Risch <julian.risch@deepset.ai>
Date: Fri, 13 Oct 2023 10:05:07 +0200
Subject: [PATCH 04/12] mark example usage as python code

---
 .../components/preprocessors/document_language_classifier.py   | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/haystack/preview/components/preprocessors/document_language_classifier.py b/haystack/preview/components/preprocessors/document_language_classifier.py
index 972b3f57c5..f0cfc40189 100644
--- a/haystack/preview/components/preprocessors/document_language_classifier.py
+++ b/haystack/preview/components/preprocessors/document_language_classifier.py
@@ -19,6 +19,8 @@ class DocumentLanguageClassifier:
     For routing texts based on their language use the related TextLanguageClassifier component.
 
     Example usage in and indexing pipeline that writes only English language documents to a Store:
+
+    ```python
     document_store = MemoryDocumentStore()
     p = Pipeline()
     p.add_component(instance=TextFileToDocument(), name="text_file_converter")
@@ -28,6 +30,7 @@ class DocumentLanguageClassifier:
     p.connect("text_file_converter.documents", "language_classifier.documents")
     p.connect("language_classifier.documents", "splitter.documents")
     p.connect("splitter.documents", "writer.documents")
+    ```
     """
 
     def __init__(self, languages: Optional[List[str]] = None):

From 6edebc03dbb5769784c48e571957547d7495b7be Mon Sep 17 00:00:00 2001
From: Julian Risch <julian.risch@deepset.ai>
Date: Fri, 13 Oct 2023 14:38:05 +0200
Subject: [PATCH 05/12] add assertions to e2e test

---
 .../pipelines/test_preprocessing_pipeline.py     | 16 ++++++++++++++--
 .../document_language_classifier.py              |  4 +++-
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/e2e/preview/pipelines/test_preprocessing_pipeline.py b/e2e/preview/pipelines/test_preprocessing_pipeline.py
index 9dac07b8d1..74f59724c8 100644
--- a/e2e/preview/pipelines/test_preprocessing_pipeline.py
+++ b/e2e/preview/pipelines/test_preprocessing_pipeline.py
@@ -27,7 +27,7 @@ def test_preprocessing_pipeline(tmp_path):
     preprocessing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
     preprocessing_pipeline.connect("file_type_router.text/plain", "text_file_converter.paths")
     preprocessing_pipeline.connect("text_file_converter.documents", "language_classifier.documents")
-    preprocessing_pipeline.connect("language_classifier.documents", "cleaner.documents")
+    preprocessing_pipeline.connect("language_classifier.en", "cleaner.documents")
     preprocessing_pipeline.connect("cleaner.documents", "splitter.documents")
     preprocessing_pipeline.connect("splitter.documents", "embedder.documents")
     preprocessing_pipeline.connect("embedder.documents", "writer.documents")
@@ -66,6 +66,18 @@ def test_preprocessing_pipeline(tmp_path):
 
     result = preprocessing_pipeline.run({"file_type_router": {"sources": paths}})
 
-    # TODO Add more assertions
     assert result["writer"]["documents_written"] == 6
     assert document_store.count_documents() == 6
+
+    # Check preprocessed texts and mime_types
+    stored_documents = document_store.filter_documents()
+    expected_texts = [
+        "This is an english sentence.",
+        " There is more to it.",
+        " It's a long text.",
+        "Spans multiple lines.",
+        "Even contains empty lines.",
+        " And extra whitespaces.",
+    ]
+    assert expected_texts == [document.text for document in stored_documents]
+    assert all(document.mime_type == "text/plain" for document in stored_documents)
diff --git a/haystack/preview/components/preprocessors/document_language_classifier.py b/haystack/preview/components/preprocessors/document_language_classifier.py
index f0cfc40189..e7edde1023 100644
--- a/haystack/preview/components/preprocessors/document_language_classifier.py
+++ b/haystack/preview/components/preprocessors/document_language_classifier.py
@@ -41,7 +41,9 @@ def __init__(self, languages: Optional[List[str]] = None):
         if not languages:
             languages = ["en"]
         self.languages = languages
-        component.set_output_types(self, unmatched=List[str], **{language: List[str] for language in languages})
+        component.set_output_types(
+            self, unmatched=List[Document], **{language: List[Document] for language in languages}
+        )
 
     def run(self, documents: List[Document]):
         """

From 621823ba608990dcb15c1af891b35867c50caed6 Mon Sep 17 00:00:00 2001
From: Julian Risch <julian.risch@deepset.ai>
Date: Tue, 17 Oct 2023 09:23:40 +0200
Subject: [PATCH 06/12] use deserialized document_store

---
 e2e/preview/pipelines/test_preprocessing_pipeline.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/e2e/preview/pipelines/test_preprocessing_pipeline.py b/e2e/preview/pipelines/test_preprocessing_pipeline.py
index 74f59724c8..dc21d1b843 100644
--- a/e2e/preview/pipelines/test_preprocessing_pipeline.py
+++ b/e2e/preview/pipelines/test_preprocessing_pipeline.py
@@ -67,10 +67,11 @@ def test_preprocessing_pipeline(tmp_path):
     result = preprocessing_pipeline.run({"file_type_router": {"sources": paths}})
 
     assert result["writer"]["documents_written"] == 6
-    assert document_store.count_documents() == 6
+    filled_document_store = preprocessing_pipeline.get_component("writer").document_store
+    assert filled_document_store.count_documents() == 6
 
     # Check preprocessed texts and mime_types
-    stored_documents = document_store.filter_documents()
+    stored_documents = filled_document_store.filter_documents()
     expected_texts = [
         "This is an english sentence.",
         " There is more to it.",

From 470d0268fb3abed43fc0135264258f9d282a46c2 Mon Sep 17 00:00:00 2001
From: Julian Risch <julian.risch@deepset.ai>
Date: Wed, 18 Oct 2023 13:06:54 +0200
Subject: [PATCH 07/12] Apply suggestions from code review

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
---
 .../components/preprocessors/document_language_classifier.py  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/haystack/preview/components/preprocessors/document_language_classifier.py b/haystack/preview/components/preprocessors/document_language_classifier.py
index e7edde1023..297216ea07 100644
--- a/haystack/preview/components/preprocessors/document_language_classifier.py
+++ b/haystack/preview/components/preprocessors/document_language_classifier.py
@@ -16,9 +16,9 @@ class DocumentLanguageClassifier:
     Routes documents onto different output connections depending on their language.
     This is useful for routing documents to different models in a pipeline depending on their language.
     The set of supported languages can be specified.
-    For routing texts based on their language use the related TextLanguageClassifier component.
+    For routing plain text using the same logic, use the related TextLanguageClassifier component instead.
 
-    Example usage in and indexing pipeline that writes only English language documents to a Store:
+    Example usage within an indexing pipeline, storing only documents written in English language documents to a Store:
 
     ```python
     document_store = MemoryDocumentStore()

From 32654b14e4a4cb652fe5d3408d022f8bd2bea5d8 Mon Sep 17 00:00:00 2001
From: Julian Risch <julian.risch@deepset.ai>
Date: Wed, 18 Oct 2023 13:10:48 +0200
Subject: [PATCH 08/12] remove from/to_dict

---
 .../document_language_classifier.py           | 21 +++----------------
 .../test_document_language_classifier.py      | 18 ----------------
 2 files changed, 3 insertions(+), 36 deletions(-)

diff --git a/haystack/preview/components/preprocessors/document_language_classifier.py b/haystack/preview/components/preprocessors/document_language_classifier.py
index 297216ea07..6d3e55038d 100644
--- a/haystack/preview/components/preprocessors/document_language_classifier.py
+++ b/haystack/preview/components/preprocessors/document_language_classifier.py
@@ -1,7 +1,7 @@
 import logging
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Optional
 
-from haystack.preview import component, default_from_dict, default_to_dict, Document
+from haystack.preview import component, Document
 from haystack.preview.lazy_imports import LazyImport
 
 logger = logging.getLogger(__name__)
@@ -25,11 +25,9 @@ class DocumentLanguageClassifier:
     p = Pipeline()
     p.add_component(instance=TextFileToDocument(), name="text_file_converter")
     p.add_component(instance=DocumentLanguageClassifier(), name="language_classifier")
-    p.add_component(instance=TextDocumentSplitter(), name="splitter")
     p.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
     p.connect("text_file_converter.documents", "language_classifier.documents")
-    p.connect("language_classifier.documents", "splitter.documents")
-    p.connect("splitter.documents", "writer.documents")
+    p.connect("language_classifier.en", "writer.documents")
     ```
     """
 
@@ -70,19 +68,6 @@ def run(self, documents: List[Document]):
 
         return output
 
-    def to_dict(self) -> Dict[str, Any]:
-        """
-        Serialize this component to a dictionary.
-        """
-        return default_to_dict(self, languages=self.languages)
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> "DocumentLanguageClassifier":
-        """
-        Deserialize this component from a dictionary.
-        """
-        return default_from_dict(cls, data)
-
     def detect_language(self, document: Document) -> Optional[str]:
         try:
             language = langdetect.detect(document.text)
diff --git a/test/preview/components/preprocessors/test_document_language_classifier.py b/test/preview/components/preprocessors/test_document_language_classifier.py
index a7ab826ac8..88e0ba88c7 100644
--- a/test/preview/components/preprocessors/test_document_language_classifier.py
+++ b/test/preview/components/preprocessors/test_document_language_classifier.py
@@ -11,24 +11,6 @@ def test_init(self):
         component = DocumentLanguageClassifier()
         assert component.languages == ["en"]
 
-    @pytest.mark.unit
-    def test_to_dict(self):
-        component = DocumentLanguageClassifier()
-        data = component.to_dict()
-        assert data == {"type": "DocumentLanguageClassifier", "init_parameters": {"languages": ["en"]}}
-
-    @pytest.mark.unit
-    def test_to_dict_with_custom_init_parameters(self):
-        component = DocumentLanguageClassifier(languages=["en", "de"])
-        data = component.to_dict()
-        assert data == {"type": "DocumentLanguageClassifier", "init_parameters": {"languages": ["en", "de"]}}
-
-    @pytest.mark.unit
-    def test_from_dict(self):
-        data = {"type": "DocumentLanguageClassifier", "init_parameters": {"languages": ["en", "de"]}}
-        component = DocumentLanguageClassifier.from_dict(data)
-        assert component.languages == ["en", "de"]
-
     @pytest.mark.unit
     def test_non_document_input(self):
         with pytest.raises(TypeError, match="DocumentLanguageClassifier expects a list of Document as input."):

From d9e1fd9b4d4e8ed7d385772e157deb471159e8e9 Mon Sep 17 00:00:00 2001
From: Julian Risch <julian.risch@deepset.ai>
Date: Wed, 18 Oct 2023 13:39:09 +0200
Subject: [PATCH 09/12] use renamed InMemoryDocumentStore

---
 e2e/preview/pipelines/test_preprocessing_pipeline.py          | 4 ++--
 .../components/preprocessors/document_language_classifier.py  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/e2e/preview/pipelines/test_preprocessing_pipeline.py b/e2e/preview/pipelines/test_preprocessing_pipeline.py
index dc21d1b843..ec6d5356e6 100644
--- a/e2e/preview/pipelines/test_preprocessing_pipeline.py
+++ b/e2e/preview/pipelines/test_preprocessing_pipeline.py
@@ -6,12 +6,12 @@
 from haystack.preview.components.preprocessors import TextDocumentSplitter, DocumentCleaner, DocumentLanguageClassifier
 from haystack.preview.components.routers import FileTypeRouter
 from haystack.preview.components.writers import DocumentWriter
-from haystack.preview.document_stores import MemoryDocumentStore
+from haystack.preview.document_stores import InMemoryDocumentStore
 
 
 def test_preprocessing_pipeline(tmp_path):
     # Create the pipeline and its components
-    document_store = MemoryDocumentStore()
+    document_store = InMemoryDocumentStore()
     preprocessing_pipeline = Pipeline()
     preprocessing_pipeline.add_component(instance=FileTypeRouter(mime_types=["text/plain"]), name="file_type_router")
     preprocessing_pipeline.add_component(instance=TextFileToDocument(), name="text_file_converter")
diff --git a/haystack/preview/components/preprocessors/document_language_classifier.py b/haystack/preview/components/preprocessors/document_language_classifier.py
index 6d3e55038d..72b4ed11c6 100644
--- a/haystack/preview/components/preprocessors/document_language_classifier.py
+++ b/haystack/preview/components/preprocessors/document_language_classifier.py
@@ -21,7 +21,7 @@ class DocumentLanguageClassifier:
     Example usage within an indexing pipeline, storing only documents written in English language documents to a Store:
 
     ```python
-    document_store = MemoryDocumentStore()
+    document_store = InMemoryDocumentStore()
     p = Pipeline()
     p.add_component(instance=TextFileToDocument(), name="text_file_converter")
     p.add_component(instance=DocumentLanguageClassifier(), name="language_classifier")

From 322d74a8d924c8a0b1e736c331fda1ae4babed70 Mon Sep 17 00:00:00 2001
From: anakin87 <stefanofiorucci@gmail.com>
Date: Tue, 31 Oct 2023 14:55:05 +0100
Subject: [PATCH 10/12] adapt to Document refactoring

---
 e2e/preview/pipelines/test_preprocessing_pipeline.py     | 2 +-
 .../preprocessors/document_language_classifier.py        | 9 ++++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/e2e/preview/pipelines/test_preprocessing_pipeline.py b/e2e/preview/pipelines/test_preprocessing_pipeline.py
index ec6d5356e6..5b32c642a4 100644
--- a/e2e/preview/pipelines/test_preprocessing_pipeline.py
+++ b/e2e/preview/pipelines/test_preprocessing_pipeline.py
@@ -80,5 +80,5 @@ def test_preprocessing_pipeline(tmp_path):
         "Even contains empty lines.",
         " And extra whitespaces.",
     ]
-    assert expected_texts == [document.text for document in stored_documents]
+    assert expected_texts == [document.content for document in stored_documents]
     assert all(document.mime_type == "text/plain" for document in stored_documents)
diff --git a/haystack/preview/components/preprocessors/document_language_classifier.py b/haystack/preview/components/preprocessors/document_language_classifier.py
index 72b4ed11c6..b3707e2295 100644
--- a/haystack/preview/components/preprocessors/document_language_classifier.py
+++ b/haystack/preview/components/preprocessors/document_language_classifier.py
@@ -33,7 +33,9 @@ class DocumentLanguageClassifier:
 
     def __init__(self, languages: Optional[List[str]] = None):
         """
-        :param languages: A list of languages in ISO code, each corresponding to a different output connection (see [langdetect` documentation](https://github.com/Mimino666/langdetect#languages)). By default, only ["en"] is supported and Documents of any other language are routed to "unmatched".
+        :param languages: A list of languages in ISO code, each corresponding to a different output connection
+            (see [`langdetect` documentation](https://github.com/Mimino666/langdetect#languages)).
+            By default, only ["en"] is supported and Documents of any other language are routed to "unmatched".
         """
         langdetect_import.check()
         if not languages:
@@ -53,7 +55,8 @@ def run(self, documents: List[Document]):
         """
         if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
             raise TypeError(
-                "DocumentLanguageClassifier expects a list of Document as input. In case you want to classify a text, please use the TextLanguageClassifier."
+                "DocumentLanguageClassifier expects a list of Document as input. "
+                "In case you want to classify a text, please use the TextLanguageClassifier."
             )
 
         output: Dict[str, List[Document]] = {language: [] for language in self.languages}
@@ -70,7 +73,7 @@ def run(self, documents: List[Document]):
 
     def detect_language(self, document: Document) -> Optional[str]:
         try:
-            language = langdetect.detect(document.text)
+            language = langdetect.detect(document.content)
         except langdetect.LangDetectException:
             logger.warning("Langdetect cannot detect the language of Document with id: %s", document.id)
             language = None

From 8e33dbc966528076c8315a23f0bff4bdb812fa59 Mon Sep 17 00:00:00 2001
From: anakin87 <stefanofiorucci@gmail.com>
Date: Tue, 31 Oct 2023 15:00:52 +0100
Subject: [PATCH 11/12] improve docstring

---
 .../components/preprocessors/document_language_classifier.py   | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/haystack/preview/components/preprocessors/document_language_classifier.py b/haystack/preview/components/preprocessors/document_language_classifier.py
index b3707e2295..98198cca12 100644
--- a/haystack/preview/components/preprocessors/document_language_classifier.py
+++ b/haystack/preview/components/preprocessors/document_language_classifier.py
@@ -18,7 +18,8 @@ class DocumentLanguageClassifier:
     The set of supported languages can be specified.
     For routing plain text using the same logic, use the related TextLanguageClassifier component instead.
 
-    Example usage within an indexing pipeline, storing only documents written in English language documents to a Store:
+    Example usage within an indexing pipeline, storing in a Document Store
+    only documents written in English:
 
     ```python
     document_store = InMemoryDocumentStore()

From 0cb60028b8191ae6c0754c580beaa9bed2ac95fd Mon Sep 17 00:00:00 2001
From: anakin87 <stefanofiorucci@gmail.com>
Date: Tue, 31 Oct 2023 15:14:26 +0100
Subject: [PATCH 12/12] fix test for new Document

---
 .../preprocessors/test_document_language_classifier.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/preview/components/preprocessors/test_document_language_classifier.py b/test/preview/components/preprocessors/test_document_language_classifier.py
index 88e0ba88c7..e666649cec 100644
--- a/test/preview/components/preprocessors/test_document_language_classifier.py
+++ b/test/preview/components/preprocessors/test_document_language_classifier.py
@@ -21,7 +21,7 @@ def test_non_document_input(self):
     def test_single_document(self):
         with pytest.raises(TypeError, match="DocumentLanguageClassifier expects a list of Document as input."):
             classifier = DocumentLanguageClassifier()
-            classifier.run(documents=Document(text="This is an english sentence."))
+            classifier.run(documents=Document(content="This is an english sentence."))
 
     @pytest.mark.unit
     def test_empty_list(self):
@@ -32,14 +32,14 @@ def test_empty_list(self):
     @pytest.mark.unit
     def test_detect_language(self):
         classifier = DocumentLanguageClassifier()
-        detected_language = classifier.detect_language(Document(text="This is an english sentence."))
+        detected_language = classifier.detect_language(Document(content="This is an english sentence."))
         assert detected_language == "en"
 
     @pytest.mark.unit
     def test_route_to_en_and_unmatched(self):
         classifier = DocumentLanguageClassifier()
-        english_document = Document(text="This is an english sentence.")
-        german_document = Document(text="Ein deutscher Satz ohne Verb.")
+        english_document = Document(content="This is an english sentence.")
+        german_document = Document(content="Ein deutscher Satz ohne Verb.")
         result = classifier.run(documents=[english_document, german_document])
         assert result == {"en": [english_document], "unmatched": [german_document]}
 
@@ -47,5 +47,5 @@ def test_route_to_en_and_unmatched(self):
     def test_warning_if_no_language_detected(self, caplog):
         with caplog.at_level(logging.WARNING):
             classifier = DocumentLanguageClassifier()
-            classifier.run(documents=[Document(text=".")])
+            classifier.run(documents=[Document(content=".")])
             assert "Langdetect cannot detect the language of Document with id" in caplog.text