Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

docs: review and normalize haystack.components.fetchers #7232

Merged
merged 2 commits into from
Feb 28, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 42 additions & 25 deletions haystack/components/fetchers/link_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,15 @@
}


def text_content_handler(response: Response) -> ByteStream:
def _text_content_handler(response: Response) -> ByteStream:
    """
    Extract the decoded text body of an HTTP response and wrap it in a ByteStream.

    :param response: Response object from the request.
    :return: The extracted text.
    """
    return ByteStream.from_string(response.text)


def binary_content_handler(response: Response) -> ByteStream:
def _binary_content_handler(response: Response) -> ByteStream:
"""
:param response: Response object from the request.
:return: The extracted binary file-like object.
Expand All @@ -44,8 +44,22 @@ def binary_content_handler(response: Response) -> ByteStream:
@component
class LinkContentFetcher:
"""
LinkContentFetcher is a component for fetching and extracting content from URLs. It supports handling various
content types, retries on failures, and automatic user-agent rotation for failed web requests.
LinkContentFetcher is a component for fetching and extracting content from URLs.

It supports handling various content types, retries on failures, and automatic user-agent rotation for failed web
requests.

Usage example:
```python
from haystack.components.fetchers.link_content import LinkContentFetcher

fetcher = LinkContentFetcher()
streams = fetcher.run(urls=["https://www.google.com"])["streams"]

assert len(streams) == 1
assert streams[0].meta == {'content_type': 'text/html', 'url': 'https://www.google.com'}
assert streams[0].data
```
"""

def __init__(
Expand All @@ -56,13 +70,14 @@ def __init__(
timeout: int = 3,
):
"""
Initializes a LinkContentFetcher instance.

:param raise_on_failure: If True, raises an exception if it fails to fetch a single URL.
For multiple URLs, it logs errors and returns the content it successfully fetched. Default is True.
:param user_agents: A list of user agents for fetching content. If None, a default user agent is used.
:param retry_attempts: Specifies how many times you want it to retry to fetch the URL's content. Default is 2.
:param timeout: Timeout in seconds for the request. Default is 3.
Initializes the component.

:param raise_on_failure: If `True`, raises an exception if it fails to fetch a single URL.
For multiple URLs, it logs errors and returns the content it successfully fetched. Default is `True`.
wochinge marked this conversation as resolved.
Show resolved Hide resolved
:param user_agents: [User agents](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent)
for fetching content. If `None`, a default user agent is used.
:param retry_attempts: Specifies how many times you want it to retry to fetch the URL's content. Default is `2`.
:param timeout: Timeout in seconds for the request. Default is `3`.
"""
self.raise_on_failure = raise_on_failure
self.user_agents = user_agents or [DEFAULT_USER_AGENT]
Expand All @@ -71,11 +86,11 @@ def __init__(
self.timeout = timeout

# register default content handlers that extract data from the response
self.handlers: Dict[str, Callable[[Response], ByteStream]] = defaultdict(lambda: text_content_handler)
self.handlers["text/html"] = text_content_handler
self.handlers["text/plain"] = text_content_handler
self.handlers["application/pdf"] = binary_content_handler
self.handlers["application/octet-stream"] = binary_content_handler
self.handlers: Dict[str, Callable[[Response], ByteStream]] = defaultdict(lambda: _text_content_handler)
self.handlers["text/html"] = _text_content_handler
self.handlers["text/plain"] = _text_content_handler
self.handlers["application/pdf"] = _binary_content_handler
self.handlers["application/octet-stream"] = _binary_content_handler

@retry(
reraise=True,
Expand All @@ -99,25 +114,27 @@ def get_response(url):
def run(self, urls: List[str]):
"""
Fetches content from a list of URLs and returns a list of extracted content streams.
Each content stream is a ByteStream object containing the extracted content as binary data.

Each content stream is a `ByteStream` object containing the extracted content as binary data.
Each ByteStream object in the returned list corresponds to the contents of a single URL.
The content type of each stream is stored in the metadata of the ByteStream object under
the key "content_type". The URL of the fetched content is stored under the key "url".

:param urls: A list of URLs to fetch content from.
:return: A lists of ByteStream objects representing the extracted content.
:return: `ByteStream` objects representing the extracted content.

:raises: If the provided list of URLs contains only a single URL, and `raise_on_failure` is set to True,
an exception will be raised in case of an error during content retrieval. In all other scenarios, any
retrieval errors are logged, and a list of successfully retrieved ByteStream objects is returned.
:raises Exception: If the provided list of URLs contains only a single URL, and `raise_on_failure` is set to
`True`, an exception will be raised in case of an error during content retrieval.
In all other scenarios, any retrieval errors are logged, and a list of successfully retrieved `ByteStream`
objects is returned.
"""
streams: List[ByteStream] = []
if not urls:
return {"streams": streams}

# don't use multithreading if there's only one URL
if len(urls) == 1:
stream_metadata, stream = self.fetch(urls[0])
stream_metadata, stream = self._fetch(urls[0])
stream.meta.update(stream_metadata)
streams.append(stream)
else:
Expand All @@ -131,7 +148,7 @@ def run(self, urls: List[str]):

return {"streams": streams}

def fetch(self, url: str) -> Tuple[Dict[str, str], ByteStream]:
def _fetch(self, url: str) -> Tuple[Dict[str, str], ByteStream]:
"""
Fetches content from a URL and returns it as a ByteStream.

Expand Down Expand Up @@ -175,12 +192,12 @@ def _fetch_with_exception_suppression(self, url: str) -> Tuple[Optional[Dict[str
"""
if self.raise_on_failure:
try:
return self.fetch(url)
return self._fetch(url)
except Exception as e:
logger.warning("Error fetching %s: %s", url, str(e))
return {"content_type": "Unknown", "url": url}, None
else:
return self.fetch(url)
return self._fetch(url)

def _get_content_type(self, response: Response):
"""
Expand Down
12 changes: 6 additions & 6 deletions test/components/fetchers/test_link_content_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

from haystack.components.fetchers.link_content import (
LinkContentFetcher,
text_content_handler,
binary_content_handler,
_text_content_handler,
_binary_content_handler,
DEFAULT_USER_AGENT,
)

Expand Down Expand Up @@ -43,10 +43,10 @@ def test_init(self):
assert fetcher.retry_attempts == 2
assert fetcher.timeout == 3
assert fetcher.handlers == {
"text/html": text_content_handler,
"text/plain": text_content_handler,
"application/pdf": binary_content_handler,
"application/octet-stream": binary_content_handler,
"text/html": _text_content_handler,
"text/plain": _text_content_handler,
"application/pdf": _binary_content_handler,
"application/octet-stream": _binary_content_handler,
}
assert hasattr(fetcher, "_get_response")

Expand Down