Skip to content

Commit

Permalink
docs: review and normalize haystack.components.fetchers (#7232)
Browse files Browse the repository at this point in the history
* docs: review and normalize `haystack.components.fetchers`

* docs: drop defaults
  • Loading branch information
wochinge authored Feb 28, 2024
1 parent 8549143 commit ac4f458
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 31 deletions.
67 changes: 42 additions & 25 deletions haystack/components/fetchers/link_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,15 @@
}


def text_content_handler(response: Response) -> ByteStream:
def _text_content_handler(response: Response) -> ByteStream:
"""
:param response: Response object from the request.
:return: The extracted text.
"""
return ByteStream.from_string(response.text)


def binary_content_handler(response: Response) -> ByteStream:
def _binary_content_handler(response: Response) -> ByteStream:
"""
:param response: Response object from the request.
:return: The extracted binary file-like object.
Expand All @@ -44,8 +44,22 @@ def binary_content_handler(response: Response) -> ByteStream:
@component
class LinkContentFetcher:
"""
LinkContentFetcher is a component for fetching and extracting content from URLs. It supports handling various
content types, retries on failures, and automatic user-agent rotation for failed web requests.
LinkContentFetcher is a component for fetching and extracting content from URLs.
It supports handling various content types, retries on failures, and automatic user-agent rotation for failed web
requests.
Usage example:
```python
from haystack.components.fetchers.link_content import LinkContentFetcher
fetcher = LinkContentFetcher()
streams = fetcher.run(urls=["https://www.google.com"])["streams"]
assert len(streams) == 1
assert streams[0].meta == {'content_type': 'text/html', 'url': 'https://www.google.com'}
assert streams[0].data
```
"""

def __init__(
Expand All @@ -56,13 +70,14 @@ def __init__(
timeout: int = 3,
):
"""
Initializes a LinkContentFetcher instance.
:param raise_on_failure: If True, raises an exception if it fails to fetch a single URL.
For multiple URLs, it logs errors and returns the content it successfully fetched. Default is True.
:param user_agents: A list of user agents for fetching content. If None, a default user agent is used.
:param retry_attempts: Specifies how many times you want it to retry to fetch the URL's content. Default is 2.
:param timeout: Timeout in seconds for the request. Default is 3.
Initializes the component.
:param raise_on_failure: If `True`, raises an exception if it fails to fetch a single URL.
For multiple URLs, it logs errors and returns the content it successfully fetched.
:param user_agents: [User agents](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent)
for fetching content. If `None`, a default user agent is used.
:param retry_attempts: The number of times to retry fetching the URL's content.
:param timeout: Timeout in seconds for the request.
"""
self.raise_on_failure = raise_on_failure
self.user_agents = user_agents or [DEFAULT_USER_AGENT]
Expand All @@ -71,11 +86,11 @@ def __init__(
self.timeout = timeout

# register default content handlers that extract data from the response
self.handlers: Dict[str, Callable[[Response], ByteStream]] = defaultdict(lambda: text_content_handler)
self.handlers["text/html"] = text_content_handler
self.handlers["text/plain"] = text_content_handler
self.handlers["application/pdf"] = binary_content_handler
self.handlers["application/octet-stream"] = binary_content_handler
self.handlers: Dict[str, Callable[[Response], ByteStream]] = defaultdict(lambda: _text_content_handler)
self.handlers["text/html"] = _text_content_handler
self.handlers["text/plain"] = _text_content_handler
self.handlers["application/pdf"] = _binary_content_handler
self.handlers["application/octet-stream"] = _binary_content_handler

@retry(
reraise=True,
Expand All @@ -99,25 +114,27 @@ def get_response(url):
def run(self, urls: List[str]):
"""
Fetches content from a list of URLs and returns a list of extracted content streams.
Each content stream is a ByteStream object containing the extracted content as binary data.
Each content stream is a `ByteStream` object containing the extracted content as binary data.
Each `ByteStream` object in the returned list corresponds to the contents of a single URL.
The content type of each stream is stored in the metadata of the `ByteStream` object under
the key "content_type". The URL of the fetched content is stored under the key "url".
:param urls: A list of URLs to fetch content from.
:return: A lists of ByteStream objects representing the extracted content.
:return: `ByteStream` objects representing the extracted content.
:raises: If the provided list of URLs contains only a single URL, and `raise_on_failure` is set to True,
an exception will be raised in case of an error during content retrieval. In all other scenarios, any
retrieval errors are logged, and a list of successfully retrieved ByteStream objects is returned.
:raises Exception: If the provided list of URLs contains only a single URL, and `raise_on_failure` is set to
`True`, an exception will be raised in case of an error during content retrieval.
In all other scenarios, any retrieval errors are logged, and a list of successfully retrieved `ByteStream`
objects is returned.
"""
streams: List[ByteStream] = []
if not urls:
return {"streams": streams}

# don't use multithreading if there's only one URL
if len(urls) == 1:
stream_metadata, stream = self.fetch(urls[0])
stream_metadata, stream = self._fetch(urls[0])
stream.meta.update(stream_metadata)
streams.append(stream)
else:
Expand All @@ -131,7 +148,7 @@ def run(self, urls: List[str]):

return {"streams": streams}

def fetch(self, url: str) -> Tuple[Dict[str, str], ByteStream]:
def _fetch(self, url: str) -> Tuple[Dict[str, str], ByteStream]:
"""
Fetches content from a URL and returns it as a ByteStream.
Expand Down Expand Up @@ -175,12 +192,12 @@ def _fetch_with_exception_suppression(self, url: str) -> Tuple[Optional[Dict[str
"""
if self.raise_on_failure:
try:
return self.fetch(url)
return self._fetch(url)
except Exception as e:
logger.warning("Error fetching %s: %s", url, str(e))
return {"content_type": "Unknown", "url": url}, None
else:
return self.fetch(url)
return self._fetch(url)

def _get_content_type(self, response: Response):
"""
Expand Down
12 changes: 6 additions & 6 deletions test/components/fetchers/test_link_content_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

from haystack.components.fetchers.link_content import (
LinkContentFetcher,
text_content_handler,
binary_content_handler,
_text_content_handler,
_binary_content_handler,
DEFAULT_USER_AGENT,
)

Expand Down Expand Up @@ -43,10 +43,10 @@ def test_init(self):
assert fetcher.retry_attempts == 2
assert fetcher.timeout == 3
assert fetcher.handlers == {
"text/html": text_content_handler,
"text/plain": text_content_handler,
"application/pdf": binary_content_handler,
"application/octet-stream": binary_content_handler,
"text/html": _text_content_handler,
"text/plain": _text_content_handler,
"application/pdf": _binary_content_handler,
"application/octet-stream": _binary_content_handler,
}
assert hasattr(fetcher, "_get_response")

Expand Down

0 comments on commit ac4f458

Please sign in to comment.