Standardize TextFileToDocument (#6232)
* simplify textfiletodocument

* fix error handling and tests

* stray print

* reno

* streams->sources

* reno

* feedback

* test

* fix tests
ZanSara authored Nov 17, 2023
1 parent c26a932 commit e888852
Showing 4 changed files with 76 additions and 402 deletions.
203 changes: 25 additions & 178 deletions haystack/preview/components/file_converters/txt.py
@@ -1,15 +1,9 @@
 import logging
 from pathlib import Path
-from typing import Optional, List, Union, Dict
+from typing import List, Union

-from canals.errors import PipelineRuntimeError
-from tqdm import tqdm
-
-from haystack.preview.lazy_imports import LazyImport
 from haystack.preview import Document, component
-
-with LazyImport("Run 'pip install langdetect'") as langdetect_import:
-    import langdetect
+from haystack.preview.dataclasses import ByteStream


 logger = logging.getLogger(__name__)
@@ -21,189 +15,42 @@ class TextFileToDocument:
     A component for converting a text file to a Document.
     """

-    def __init__(
-        self,
-        encoding: str = "utf-8",
-        remove_numeric_tables: bool = False,
-        numeric_row_threshold: float = 0.4,
-        valid_languages: Optional[List[str]] = None,
-        progress_bar: bool = True,
-    ):
+    def __init__(self, encoding: str = "utf-8"):
         """
         Create a TextFileToDocument component.
-        :param encoding: The encoding of the text files. Default: `"utf-8"`
-        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
-            The tabular structures in documents might be noise for reader models if they don't have table parsing
-            capability for finding answers. However, tables may also have long strings that could be possible candidates
-            for answers. The rows containing strings are thus retained in this option. Default: `False`
-        :param numeric_row_threshold: Applicable if `remove_numeric_tables` is set to `True`. This is the threshold to
-            determine if a line in the provided text file is a numeric table row or not. The value is the ratio of
-            numeric words to the total number of words in a line. Default: `0.4`
-        :param valid_languages: Validate languages from a list of languages specified in the
-            [ISO 639-1 format]((https://en.wikipedia.org/wiki/ISO_639-1)). This option can be used to add a test for
-            encoding errors. If the extracted text is not one of the valid languages, then there might be an encoding
-            error resulting in garbled text. Default: `None`
-        :param progress_bar: Whether to show a progress bar for the conversion process. Default: `True`
+        :param encoding: The default encoding of the text files. Default: `"utf-8"`.
+            Note that if the encoding is specified in the metadata of a ByteStream,
+            it will override this default.
         """
-        langdetect_import.check()
-
         self.encoding = encoding
-        self.remove_numeric_tables = remove_numeric_tables
-        self.numeric_row_threshold = numeric_row_threshold
-        self.valid_languages = valid_languages or []
-        self.progress_bar = progress_bar

     @component.output_types(documents=List[Document])
-    def run(
-        self,
-        paths: List[Union[str, Path]],
-        metadata: Optional[Union[Dict, List[Dict]]] = None,
-        encoding: Optional[str] = None,
-        remove_numeric_tables: Optional[bool] = None,
-        numeric_row_threshold: Optional[float] = None,
-        valid_languages: Optional[List[str]] = None,
-        progress_bar: Optional[bool] = None,
-    ):
+    def run(self, sources: List[Union[str, Path, ByteStream]]):
         """
         Convert text files to Documents.
-        :param paths: A list of paths to text files.
-        :param metadata: Optional metadata to attach to the Documents. If a list is provided, the length of the list
-            must match the number of paths. Default: `None`
-        :param encoding: The encoding of the text files. Default: `"utf-8"`
-        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
-            The tabular structures in documents might be noise for reader models if they don't have table parsing
-            capability for finding answers. However, tables may also have long strings that could be possible candidates
-            for answers. The rows containing strings are thus retained in this option. Default: `False`
-        :param numeric_row_threshold: Applicable if `remove_numeric_tables` is set to `True`. This is the threshold to
-            determine if a line in the provided text file is a numeric table row or not. The value is the ratio of
-            numeric words to the total number of words in a line. Default: `0.4`
-        :param valid_languages: Validate languages from a list of languages specified in the
-            [ISO 639-1 format]((https://en.wikipedia.org/wiki/ISO_639-1)). This option can be used to add a test for
-            encoding errors. If the extracted text is not one of the valid languages, then there might be an encoding
-            error resulting in garbled text. Default: `None`
-        :param progress_bar: Whether to show a progress bar for the conversion process. Default: `True`
+        :param sources: A list of paths to text files or ByteStream objects.
+            Note that if an encoding is specified in the metadata of a ByteStream,
+            it will override the component's default.
         :return: A dictionary containing the converted documents.
         """
-        if encoding is None:
-            encoding = self.encoding
-        if remove_numeric_tables is None:
-            remove_numeric_tables = self.remove_numeric_tables
-        if numeric_row_threshold is None:
-            numeric_row_threshold = self.numeric_row_threshold
-        if valid_languages is None:
-            valid_languages = self.valid_languages
-        if progress_bar is None:
-            progress_bar = self.progress_bar
-
-        metas = TextFileToDocument._prepare_metadata(metadata, paths)
-
         documents = []
-        for path, meta in tqdm(
-            zip(paths, metas), total=len(paths), desc="Converting text files", disable=not progress_bar
-        ):
+        for source in sources:
+            if isinstance(source, (Path, str)):
+                try:
+                    path = source
+                    source = ByteStream.from_file_path(Path(source))
+                    source.metadata["file_path"] = str(path)
+                except Exception as e:
+                    logger.warning("Could not convert file %s. Skipping it. Error message: %s", source, e)
+                    continue
             try:
-                text = self._read_and_clean_file(
-                    path=path, encoding=encoding, remove_numeric_tables=remove_numeric_tables
-                )
+                encoding = source.metadata.get("encoding", self.encoding)
+                document = Document(content=source.data.decode(encoding))
+                document.meta = source.metadata
+                documents.append(document)
             except Exception as e:
-                logger.warning("Could not read file %s. Skipping it. Error message: %s", path, e)
-                continue
-
-            if valid_languages is not None and not TextFileToDocument._validate_language(text, valid_languages):
-                logger.warning(
-                    "Text from file %s is not in one of the valid languages: %s. "
-                    "The file may have been decoded incorrectly.",
-                    path,
-                    valid_languages,
-                )
-
-            document = Document(content=text, meta=meta)
-            documents.append(document)
+                logger.warning("Could not convert file %s. Skipping it. Error message: %s", source, e)

         return {"documents": documents}
-
-    @staticmethod
-    def _prepare_metadata(metadata: Optional[Union[Dict, List[Dict]]], paths: List[Union[str, Path]]) -> List[Dict]:
-        """
-        Prepare the metadata for the Documents.
-        :param metadata: The metadata for the Documents.
-        :param paths: The paths to the text files.
-        """
-        if metadata is None:
-            return [{"file_path": str(path)} for path in paths]
-
-        if isinstance(metadata, dict):
-            metadata = [metadata] * len(paths)
-
-        if len(metadata) != len(paths):
-            raise PipelineRuntimeError(
-                f"The number of metadata entries must match the number of paths if metadata is a list. "
-                f"Number of paths: {len(paths)}, number of metadata entries: {len(metadata)}."
-            )
-
-        return [{**m, "file_path": m.get("file_path", str(path))} for m, path in zip(metadata, paths)]
-
-    def _read_and_clean_file(self, path: Union[str, Path], encoding: str, remove_numeric_tables: bool) -> str:
-        """
-        Read and clean the text file.
-        :param path: The path to the text file.
-        :param encoding: The encoding of the text file.
-        :param remove_numeric_tables: Whether to remove numeric tables.
-        :return: The text of the file cleaned from numeric tables if `remove_numeric_tables` is `True`.
-        """
-        if not Path(path).exists():
-            raise PipelineRuntimeError(f"File at path {path} does not exist.")
-
-        with open(path, encoding=encoding) as file:
-            text = file.read()
-            pages = text.split("\f")
-            cleaned_pages = [self._clean_page(page, remove_numeric_tables) for page in pages]
-            return "\f".join(cleaned_pages)
-
-    def _clean_page(self, page: str, remove_numeric_tables: bool) -> str:
-        """
-        Clean a page of text from numeric tables if `remove_numeric_tables` is `True`.
-        :param page: The content of a page of a text file.
-        :param remove_numeric_tables: Whether to remove numeric tables.
-        :return: The text from the page cleaned from numeric tables if `remove_numeric_tables` is `True`.
-        """
-        cleaned_lines = page.splitlines()
-        if remove_numeric_tables:
-            cleaned_lines = [line for line in cleaned_lines if not self._is_numeric_row(line)]
-
-        return "\n".join(cleaned_lines)
-
-    def _is_numeric_row(self, line: str) -> bool:
-        """
-        Check if a line of a text file is a numeric row. A line is considered a numeric row if it contains more
-        than 40% digits and does not end with a period.
-        :param line: The content of a line of a text file.
-        """
-        words = line.split()
-        digits = [word for word in words if any(char.isdigit() for char in word)]
-        return len(digits) / len(words) > self.numeric_row_threshold and not line.strip().endswith(".")
-
-    @staticmethod
-    def _validate_language(text: str, valid_languages: List[str]) -> bool:
-        """
-        Validate if the detected language of the text is one of the valid languages.
-        :param text: The text to validate.
-        :param valid_languages: A list of valid languages.
-        """
-        if not valid_languages:
-            return True
-
-        try:
-            lang = langdetect.detect(text)
-        except langdetect.lang_detect_exception.LangDetectException:
-            lang = None
-
-        return lang in valid_languages
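
For orientation, here is a minimal usage sketch of the simplified component. It is not part of the commit: the file names and the UTF-16 metadata value are illustrative assumptions, while the import paths, the ByteStream calls, and the "documents" output key come from the diff above.

# Minimal usage sketch (assumed, not from this commit); file names are hypothetical.
from pathlib import Path

from haystack.preview.components.file_converters.txt import TextFileToDocument
from haystack.preview.dataclasses import ByteStream

converter = TextFileToDocument(encoding="utf-8")

# A ByteStream can carry its own encoding in metadata; as documented above, it
# overrides the component default. legacy_notes.txt is assumed to be UTF-16 encoded.
stream = ByteStream.from_file_path(Path("legacy_notes.txt"))
stream.metadata["encoding"] = "utf-16"

# Paths and ByteStreams can be mixed; unreadable sources are skipped with a warning.
result = converter.run(sources=["docs/intro.txt", Path("docs/setup.txt"), stream])
for document in result["documents"]:
    print(document.meta.get("file_path"), len(document.content))
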
@@ -0,0 +1,3 @@
+preview:
+  - Remove most parameters from TextFileToDocument to make it match all other converters.
+  - Add support for ByteStreams
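
The commit message also mentions updated error handling and tests, but the test file is not shown on this page. As an illustration only, a hypothetical pytest sketch of the skip-on-error behavior visible in run() could look like this (the test name and fixtures are assumptions, not the PR's actual tests):

# Hypothetical test sketch: unreadable sources are skipped with a warning,
# readable ones become Documents with "file_path" metadata.
from haystack.preview.components.file_converters.txt import TextFileToDocument


def test_run_skips_unreadable_sources(tmp_path, caplog):
    good_file = tmp_path / "good.txt"
    good_file.write_text("hello world", encoding="utf-8")

    converter = TextFileToDocument()
    result = converter.run(sources=[good_file, tmp_path / "missing.txt"])

    assert len(result["documents"]) == 1
    assert result["documents"][0].content == "hello world"
    assert result["documents"][0].meta["file_path"] == str(good_file)
    assert "Could not convert file" in caplog.text
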
