diff --git a/sopel/builtins/url.py b/sopel/builtins/url.py index 266538c4c..3e8362141 100644 --- a/sopel/builtins/url.py +++ b/sopel/builtins/url.py @@ -26,6 +26,7 @@ if TYPE_CHECKING: from collections.abc import Generator + from sopel.bot import Sopel, SopelWrapper from sopel.config import Config from sopel.trigger import Trigger @@ -426,10 +427,16 @@ def process_urls( continue # Call the URL to get a title, if possible + unsafe_urls = [ + url + for url, data in bot.memory.get("safety_cache", {}).items() + if data.get("positives") + ] title_results = find_title( url, allow_local=bot.config.url.enable_private_resolution, - memory=bot.memory, + unsafe_urls=unsafe_urls, + unsafe_domains=bot.memory.get("safety_cache_local", {}).values(), ) if not title_results: # No title found: don't handle this URL @@ -484,13 +491,15 @@ def find_title( url: str, verify: bool = True, allow_local: bool = False, - memory: tools.SopelMemory = {}, -) -> Optional[Tuple[str, str]]: + unsafe_urls: list = [], + unsafe_domains: list = [], +) -> Optional[tuple[str, str]]: """Fetch the title for the given URL. :param verify: Whether to require a valid certificate when using https :param allow_local: Allow requests to non-global addresses (RFC1918, etc.) - :param memory: The bot.memory to search for safety.py caches + :param unsafe_urls: A list of URLs to consider malicious and ignore + :param unsafe_domains: A list of domains to consider malicious and ignore :return: A tuple of the (title, final_hostname) that were found, or None """ original_url = url @@ -501,14 +510,14 @@ def find_title( while redirects_left > 0: redirects_left -= 1 parsed_url = urlparse(url) + if not parsed_url.hostname: + return None # Avoid fetching known malicious links - safety_cache = bot.memory.get("safety_cache", {}) - safety_cache_local = bot.memory.get("safety_cache_local", {}) - if url in safety_cache and safety_cache[url]["positives"] > 0: - LOGGER.debug("Ignoring unsafe URL: %r", url) - return None - if parsed_url.hostname.lower() in safety_cache_local: + if url in unsafe_urls: + LOGGER.debug("Ignoring unsafe URL: %r", url) + return None + if parsed_url.hostname.lower() in unsafe_domains: LOGGER.debug("Ignoring unsafe domain: %r", url) return None @@ -548,9 +557,9 @@ def find_title( LOGGER.debug( "URL %r redirected to %r", url, response.headers.get("Location") ) - url = response.headers.get("Location") - if url is None: + if "Location" not in response.headers: return None + url = response.headers["Location"] continue content_bytes = b''