diff --git a/haystack/components/fetchers/link_content.py b/haystack/components/fetchers/link_content.py
index 7d531694cd..3f437b84f3 100644
--- a/haystack/components/fetchers/link_content.py
+++ b/haystack/components/fetchers/link_content.py
@@ -25,7 +25,7 @@
 }


-def text_content_handler(response: Response) -> ByteStream:
+def _text_content_handler(response: Response) -> ByteStream:
     """
     :param response: Response object from the request.
     :return: The extracted text.
@@ -33,7 +33,7 @@ def text_content_handler(response: Response) -> ByteStream:
     return ByteStream.from_string(response.text)


-def binary_content_handler(response: Response) -> ByteStream:
+def _binary_content_handler(response: Response) -> ByteStream:
     """
     :param response: Response object from the request.
     :return: The extracted binary file-like object.
@@ -44,8 +44,22 @@ def binary_content_handler(response: Response) -> ByteStream:
 @component
 class LinkContentFetcher:
     """
-    LinkContentFetcher is a component for fetching and extracting content from URLs. It supports handling various
-    content types, retries on failures, and automatic user-agent rotation for failed web requests.
+    LinkContentFetcher is a component for fetching and extracting content from URLs.
+
+    It supports handling various content types, retries on failures, and automatic user-agent rotation for failed web
+    requests.
+
+    Usage example:
+    ```python
+    from haystack.components.fetchers.link_content import LinkContentFetcher
+
+    fetcher = LinkContentFetcher()
+    streams = fetcher.run(urls=["https://www.google.com"])["streams"]
+
+    assert len(streams) == 1
+    assert streams[0].meta == {'content_type': 'text/html', 'url': 'https://www.google.com'}
+    assert streams[0].data
+    ```
     """

     def __init__(
@@ -56,13 +70,14 @@ def __init__(
         timeout: int = 3,
     ):
         """
-        Initializes a LinkContentFetcher instance.
-
-        :param raise_on_failure: If True, raises an exception if it fails to fetch a single URL.
-            For multiple URLs, it logs errors and returns the content it successfully fetched. Default is True.
-        :param user_agents: A list of user agents for fetching content. If None, a default user agent is used.
-        :param retry_attempts: Specifies how many times you want it to retry to fetch the URL's content. Default is 2.
-        :param timeout: Timeout in seconds for the request. Default is 3.
+        Initializes the component.
+
+        :param raise_on_failure: If `True`, raises an exception if it fails to fetch a single URL.
+            For multiple URLs, it logs errors and returns the content it successfully fetched.
+        :param user_agents: [User agents](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent)
+            for fetching content. If `None`, a default user agent is used.
+        :param retry_attempts: Specifies how many times you want it to retry to fetch the URL's content.
+        :param timeout: Timeout in seconds for the request.
         """
         self.raise_on_failure = raise_on_failure
         self.user_agents = user_agents or [DEFAULT_USER_AGENT]
@@ -71,11 +86,11 @@ def __init__(
         self.timeout = timeout

         # register default content handlers that extract data from the response
-        self.handlers: Dict[str, Callable[[Response], ByteStream]] = defaultdict(lambda: text_content_handler)
-        self.handlers["text/html"] = text_content_handler
-        self.handlers["text/plain"] = text_content_handler
-        self.handlers["application/pdf"] = binary_content_handler
-        self.handlers["application/octet-stream"] = binary_content_handler
+        self.handlers: Dict[str, Callable[[Response], ByteStream]] = defaultdict(lambda: _text_content_handler)
+        self.handlers["text/html"] = _text_content_handler
+        self.handlers["text/plain"] = _text_content_handler
+        self.handlers["application/pdf"] = _binary_content_handler
+        self.handlers["application/octet-stream"] = _binary_content_handler

         @retry(
             reraise=True,
@@ -99,17 +114,19 @@ def get_response(url):
     def run(self, urls: List[str]):
         """
         Fetches content from a list of URLs and returns a list of extracted content streams.
-        Each content stream is a ByteStream object containing the extracted content as binary data.
+
+        Each content stream is a `ByteStream` object containing the extracted content as binary data.
         Each ByteStream object in the returned list corresponds to the contents of a single URL.
         The content type of each stream is stored in the metadata of the ByteStream object under
         the key "content_type". The URL of the fetched content is stored under the key "url".

         :param urls: A list of URLs to fetch content from.
-        :return: A lists of ByteStream objects representing the extracted content.
+        :return: `ByteStream` objects representing the extracted content.

-        :raises: If the provided list of URLs contains only a single URL, and `raise_on_failure` is set to True,
-            an exception will be raised in case of an error during content retrieval. In all other scenarios, any
-            retrieval errors are logged, and a list of successfully retrieved ByteStream objects is returned.
+        :raises Exception: If the provided list of URLs contains only a single URL, and `raise_on_failure` is set to
+            `True`, an exception will be raised in case of an error during content retrieval.
+            In all other scenarios, any retrieval errors are logged, and a list of successfully retrieved `ByteStream`
+            objects is returned.
         """
         streams: List[ByteStream] = []
         if not urls:
@@ -117,7 +134,7 @@ def run(self, urls: List[str]):

         # don't use multithreading if there's only one URL
         if len(urls) == 1:
-            stream_metadata, stream = self.fetch(urls[0])
+            stream_metadata, stream = self._fetch(urls[0])
             stream.meta.update(stream_metadata)
             streams.append(stream)
         else:
@@ -131,7 +148,7 @@ def run(self, urls: List[str]):

         return {"streams": streams}

-    def fetch(self, url: str) -> Tuple[Dict[str, str], ByteStream]:
+    def _fetch(self, url: str) -> Tuple[Dict[str, str], ByteStream]:
         """
         Fetches content from a URL and returns it as a ByteStream.

@@ -175,12 +192,12 @@ def _fetch_with_exception_suppression(self, url: str) -> Tuple[Optional[Dict[str
         """
         if self.raise_on_failure:
             try:
-                return self.fetch(url)
+                return self._fetch(url)
             except Exception as e:
                 logger.warning("Error fetching %s: %s", url, str(e))
                 return {"content_type": "Unknown", "url": url}, None
         else:
-            return self.fetch(url)
+            return self._fetch(url)

     def _get_content_type(self, response: Response):
         """
diff --git a/test/components/fetchers/test_link_content_fetcher.py b/test/components/fetchers/test_link_content_fetcher.py
index e3f350d5a1..7b9aae01fb 100644
--- a/test/components/fetchers/test_link_content_fetcher.py
+++ b/test/components/fetchers/test_link_content_fetcher.py
@@ -5,8 +5,8 @@

 from haystack.components.fetchers.link_content import (
     LinkContentFetcher,
-    text_content_handler,
-    binary_content_handler,
+    _text_content_handler,
+    _binary_content_handler,
     DEFAULT_USER_AGENT,
 )

@@ -43,10 +43,10 @@ def test_init(self):
         assert fetcher.retry_attempts == 2
         assert fetcher.timeout == 3
         assert fetcher.handlers == {
-            "text/html": text_content_handler,
-            "text/plain": text_content_handler,
-            "application/pdf": binary_content_handler,
-            "application/octet-stream": binary_content_handler,
+            "text/html": _text_content_handler,
+            "text/plain": _text_content_handler,
+            "application/pdf": _binary_content_handler,
+            "application/octet-stream": _binary_content_handler,
         }
         assert hasattr(fetcher, "_get_response")
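
Note for downstream users: since this diff makes `text_content_handler` and `binary_content_handler` private, code that imported them directly will break; the supported extension point that remains public is the `handlers` dict registered in `__init__` above. A minimal sketch of registering a custom handler through it — the `application/json` key, the `json_content_handler` name, and the example URL are illustrative, not part of this PR:

```python
from haystack.components.fetchers.link_content import LinkContentFetcher
from haystack.dataclasses import ByteStream
from requests import Response


def json_content_handler(response: Response) -> ByteStream:
    # A custom handler must match Callable[[Response], ByteStream],
    # the same signature as the now-private _text_content_handler.
    return ByteStream.from_string(response.text)


fetcher = LinkContentFetcher()
# handlers maps a content type to a handler; content types without an
# explicit entry fall back to the text handler via the defaultdict above.
fetcher.handlers["application/json"] = json_content_handler
streams = fetcher.run(urls=["https://example.com/data.json"])["streams"]
```

Keeping `handlers` public while prefixing the default handler functions with an underscore preserves this customization path and still signals that the built-in handlers are implementation details.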