docs: review and normalize haystack.components.fetchers (#7232)

* docs: review and normalize `haystack.components.fetchers`
* docs: drop defaults
deepset-ai · Feb 28, 2024 · ac4f458 · ac4f458
1 parent 8549143
commit ac4f458
Show file tree

Hide file tree

Showing 2 changed files with 48 additions and 31 deletions.
diff --git a/haystack/components/fetchers/link_content.py b/haystack/components/fetchers/link_content.py
@@ -25,15 +25,15 @@
 }
 
 
-def text_content_handler(response: Response) -> ByteStream:
+def _text_content_handler(response: Response) -> ByteStream:
     """
     :param response: Response object from the request.
     :return: The extracted text.
     """
     return ByteStream.from_string(response.text)
 
 
-def binary_content_handler(response: Response) -> ByteStream:
+def _binary_content_handler(response: Response) -> ByteStream:
     """
     :param response: Response object from the request.
     :return: The extracted binary file-like object.
@@ -44,8 +44,22 @@ def binary_content_handler(response: Response) -> ByteStream:
 @component
 class LinkContentFetcher:
     """
-    LinkContentFetcher is a component for fetching and extracting content from URLs. It supports handling various
-    content types, retries on failures, and automatic user-agent rotation for failed web requests.
+    LinkContentFetcher is a component for fetching and extracting content from URLs.
+
+    It supports handling various content types, retries on failures, and automatic user-agent rotation for failed web
+    requests.
+
+    Usage example:
+    ```python
+    from haystack.components.fetchers.link_content import LinkContentFetcher
+
+    fetcher = LinkContentFetcher()
+    streams = fetcher.run(urls=["https://www.google.com"])["streams"]
+
+    assert len(streams) == 1
+    assert streams[0].meta == {'content_type': 'text/html', 'url': 'https://www.google.com'}
+    assert streams[0].data
+    ```
     """
 
     def __init__(
@@ -56,13 +70,14 @@ def __init__(
         timeout: int = 3,
     ):
         """
-        Initializes a LinkContentFetcher instance.
-
-        :param raise_on_failure: If True, raises an exception if it fails to fetch a single URL.
-            For multiple URLs, it logs errors and returns the content it successfully fetched. Default is True.
-        :param user_agents: A list of user agents for fetching content. If None, a default user agent is used.
-        :param retry_attempts: Specifies how many times you want it to retry to fetch the URL's content. Default is 2.
-        :param timeout: Timeout in seconds for the request. Default is 3.
+        Initializes the component.
+
+        :param raise_on_failure: If `True`, raises an exception if it fails to fetch a single URL.
+            For multiple URLs, it logs errors and returns the content it successfully fetched.
+        :param user_agents: [User agents](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent)
+            for fetching content. If `None`, a default user agent is used.
+        :param retry_attempts: Specifies how many times you want it to retry to fetch the URL's content.
+        :param timeout: Timeout in seconds for the request.
         """
         self.raise_on_failure = raise_on_failure
         self.user_agents = user_agents or [DEFAULT_USER_AGENT]
@@ -71,11 +86,11 @@ def __init__(
         self.timeout = timeout
 
         # register default content handlers that extract data from the response
-        self.handlers: Dict[str, Callable[[Response], ByteStream]] = defaultdict(lambda: text_content_handler)
-        self.handlers["text/html"] = text_content_handler
-        self.handlers["text/plain"] = text_content_handler
-        self.handlers["application/pdf"] = binary_content_handler
-        self.handlers["application/octet-stream"] = binary_content_handler
+        self.handlers: Dict[str, Callable[[Response], ByteStream]] = defaultdict(lambda: _text_content_handler)
+        self.handlers["text/html"] = _text_content_handler
+        self.handlers["text/plain"] = _text_content_handler
+        self.handlers["application/pdf"] = _binary_content_handler
+        self.handlers["application/octet-stream"] = _binary_content_handler
 
         @retry(
             reraise=True,
@@ -99,25 +114,27 @@ def get_response(url):
     def run(self, urls: List[str]):
         """
         Fetches content from a list of URLs and returns a list of extracted content streams.
-        Each content stream is a ByteStream object containing the extracted content as binary data.
+
+        Each content stream is a `ByteStream` object containing the extracted content as binary data.
         Each ByteStream object in the returned list corresponds to the contents of a single URL.
         The content type of each stream is stored in the metadata of the ByteStream object under
         the key "content_type". The URL of the fetched content is stored under the key "url".
 
         :param urls: A list of URLs to fetch content from.
-        :return: A lists of ByteStream objects representing the extracted content.
+        :return: `ByteStream` objects representing the extracted content.
 
-        :raises: If the provided list of URLs contains only a single URL, and `raise_on_failure` is set to True,
-        an exception will be raised in case of an error during content retrieval. In all other scenarios, any
-        retrieval errors are logged, and a list of successfully retrieved ByteStream objects is returned.
+        :raises Exception: If the provided list of URLs contains only a single URL, and `raise_on_failure` is set to
+            `True`, an exception will be raised in case of an error during content retrieval.
+            In all other scenarios, any retrieval errors are logged, and a list of successfully retrieved `ByteStream`
+            objects is returned.
         """
         streams: List[ByteStream] = []
         if not urls:
             return {"streams": streams}
 
         # don't use multithreading if there's only one URL
         if len(urls) == 1:
-            stream_metadata, stream = self.fetch(urls[0])
+            stream_metadata, stream = self._fetch(urls[0])
             stream.meta.update(stream_metadata)
             streams.append(stream)
         else:
@@ -131,7 +148,7 @@ def run(self, urls: List[str]):
 
         return {"streams": streams}
 
-    def fetch(self, url: str) -> Tuple[Dict[str, str], ByteStream]:
+    def _fetch(self, url: str) -> Tuple[Dict[str, str], ByteStream]:
         """
         Fetches content from a URL and returns it as a ByteStream.
 
@@ -175,12 +192,12 @@ def _fetch_with_exception_suppression(self, url: str) -> Tuple[Optional[Dict[str
         """
         if self.raise_on_failure:
             try:
-                return self.fetch(url)
+                return self._fetch(url)
             except Exception as e:
                 logger.warning("Error fetching %s: %s", url, str(e))
                 return {"content_type": "Unknown", "url": url}, None
         else:
-            return self.fetch(url)
+            return self._fetch(url)
 
     def _get_content_type(self, response: Response):
         """

diff --git a/test/components/fetchers/test_link_content_fetcher.py b/test/components/fetchers/test_link_content_fetcher.py
@@ -5,8 +5,8 @@
 
 from haystack.components.fetchers.link_content import (
     LinkContentFetcher,
-    text_content_handler,
-    binary_content_handler,
+    _text_content_handler,
+    _binary_content_handler,
     DEFAULT_USER_AGENT,
 )
 
@@ -43,10 +43,10 @@ def test_init(self):
         assert fetcher.retry_attempts == 2
         assert fetcher.timeout == 3
         assert fetcher.handlers == {
-            "text/html": text_content_handler,
-            "text/plain": text_content_handler,
-            "application/pdf": binary_content_handler,
-            "application/octet-stream": binary_content_handler,
+            "text/html": _text_content_handler,
+            "text/plain": _text_content_handler,
+            "application/pdf": _binary_content_handler,
+            "application/octet-stream": _binary_content_handler,
         }
         assert hasattr(fetcher, "_get_response")