diff --git a/sopel/modules/url.py b/sopel/modules/url.py index 69a43e71c2..92126cfe13 100644 --- a/sopel/modules/url.py +++ b/sopel/modules/url.py @@ -10,9 +10,10 @@ """ from __future__ import annotations -import ipaddress +from ipaddress import ip_address import logging import re +from typing import Generator, List, Optional, Tuple from urllib.parse import urlparse import dns.resolver @@ -20,8 +21,10 @@ from urllib3.exceptions import LocationValueError # type: ignore[import] from sopel import plugin, tools -from sopel.config import types +from sopel.bot import Sopel, SopelWrapper +from sopel.config import Config, types from sopel.tools import web +from sopel.trigger import Trigger LOGGER = logging.getLogger(__name__) @@ -62,13 +65,10 @@ class UrlSection(types.StaticSection): """If greater than 0, the title fetcher will include a TinyURL version of links longer than this many characters.""" enable_private_resolution = types.BooleanAttribute( 'enable_private_resolution', default=False) - """Enable URL lookups for RFC1918 addresses""" - enable_dns_resolution = types.BooleanAttribute( - 'enable_dns_resolution', default=False) - """Enable DNS resolution for all domains to validate if there are RFC1918 resolutions""" + """Enable requests to private and local network IP addresses""" -def configure(config): +def configure(config: Config): """ | name | example | purpose | | ---- | ------- | ------- | @@ -76,8 +76,7 @@ def configure(config): | exclude | https?://git\\\\.io/.* | A list of regular expressions for URLs for which the title should not be shown. | | exclusion\\_char | ! | A character (or string) which, when immediately preceding a URL, will stop the URL's title from being shown. | | shorten\\_url\\_length | 72 | If greater than 0, the title fetcher will include a TinyURL version of links longer than this many characters. | - | enable\\_private\\_resolution | False | Enable URL lookups for RFC1918 addresses. | - | enable\\_dns\\_resolution | False | Enable DNS resolution for all domains to validate if there are RFC1918 resolutions. | + | enable\\_private\\_resolution | False | Enable requests to private and local network IP addresses. | """ config.define_section('url', UrlSection) config.url.configure_setting( @@ -100,15 +99,11 @@ def configure(config): ) config.url.configure_setting( 'enable_private_resolution', - 'Enable URL lookups for RFC1918 addresses?' - ) - config.url.configure_setting( - 'enable_dns_resolution', - 'Enable DNS resolution for all domains to validate if there are RFC1918 resolutions?' + 'Enable requests to private and local network IP addresses?' ) -def setup(bot): +def setup(bot: Sopel): bot.config.define_section('url', UrlSection) if bot.config.url.exclude: @@ -139,7 +134,7 @@ def setup(bot): bot.memory['shortened_urls'] = tools.SopelMemory() -def shutdown(bot): +def shutdown(bot: Sopel): # Unset `url_exclude` and `last_seen_url`, but not `shortened_urls`; # clearing `shortened_urls` will increase API calls. Leaving it in memory # should not lead to unexpected behavior. @@ -154,7 +149,7 @@ def shutdown(bot): @plugin.example('.urlpexclude example\\.com/\\w+', user_help=True) @plugin.example('.urlexclude example.com/path', user_help=True) @plugin.output_prefix('[url] ') -def url_ban(bot, trigger): +def url_ban(bot: SopelWrapper, trigger: Trigger): """Exclude a URL from auto title. Use ``urlpexclude`` to exclude a pattern instead of a URL. @@ -199,7 +194,7 @@ def url_ban(bot, trigger): @plugin.example('.urlpallow example\\.com/\\w+', user_help=True) @plugin.example('.urlallow example.com/path', user_help=True) @plugin.output_prefix('[url] ') -def url_unban(bot, trigger): +def url_unban(bot: SopelWrapper, trigger: Trigger): """Allow a URL for auto title. Use ``urlpallow`` to allow a pattern instead of a URL. @@ -246,30 +241,27 @@ def url_unban(bot, trigger): 'Google | www.google.com', online=True, vcr=True) @plugin.output_prefix('[url] ') -def title_command(bot, trigger): +def title_command(bot: SopelWrapper, trigger: Trigger): """ Show the title or URL information for the given URL, or the last URL seen in this channel. """ + result_count = 0 + if not trigger.group(2): if trigger.sender not in bot.memory['last_seen_url']: return - matched = check_callbacks( - bot, bot.memory['last_seen_url'][trigger.sender]) - if matched: - return - else: - urls = [bot.memory['last_seen_url'][trigger.sender]] + urls = [bot.memory["last_seen_url"][trigger.sender]] else: - urls = list( # needs to be a list so len() can be checked later - web.search_urls( - trigger, - exclusion_char=bot.config.url.exclusion_char - ) - ) + # needs to be a list so len() can be checked later + urls = list(web.search_urls(trigger)) - result_count = 0 - for url, title, domain, tinyurl in process_urls(bot, trigger, urls): + for url, title, domain, tinyurl, dispatched in process_urls( + bot, trigger, urls, requested=True + ): + if dispatched: + result_count += 1 + continue message = '%s | %s' % (title, domain) if tinyurl: message += ' ( %s )' % tinyurl @@ -289,7 +281,7 @@ def title_command(bot, trigger): @plugin.rule(r'(?u).*(https?://\S+).*') @plugin.output_prefix('[url] ') -def title_auto(bot, trigger): +def title_auto(bot: SopelWrapper, trigger: Trigger): """ Automatically show titles for URLs. For shortened URLs/redirects, find where the URL redirects to and show the title for that (or call a function @@ -311,55 +303,68 @@ def title_auto(bot, trigger): urls = web.search_urls( trigger, exclusion_char=bot.config.url.exclusion_char, clean=True) - for url, title, domain, tinyurl in process_urls(bot, trigger, urls): - message = '%s | %s' % (title, domain) - if tinyurl: - message += ' ( %s )' % tinyurl - # Guard against responding to other instances of this bot. - if message != trigger: - bot.say(message) - bot.memory['last_seen_url'][trigger.sender] = url + for url, title, domain, tinyurl, dispatched in process_urls(bot, trigger, urls): + if not dispatched: + message = '%s | %s' % (title, domain) + if tinyurl: + message += ' ( %s )' % tinyurl + # Guard against responding to other instances of this bot. + if message != trigger: + bot.say(message) + bot.memory["last_seen_url"][trigger.sender] = url -def process_urls(bot, trigger, urls): +def process_urls( + bot: SopelWrapper, trigger: Trigger, urls: List[str], requested: bool = False +) -> Generator[Tuple[str, str, Optional[str], Optional[str], bool], None, None]: """ - For each URL in the list, ensure that it isn't handled by another plugin. - If not, find where it redirects to, if anywhere. If that redirected URL - should be handled by another plugin, dispatch the callback for it. - Return a list of (title, hostname) tuples for each URL which is not handled - by another plugin. + For each URL in the list, ensure it should be titled, and do so. + + See if it's handled by another plugin. If not, find where it redirects to, + if anywhere. If that redirected URL should be handled by another plugin, + dispatch the callback for it. Return a list of + (url, title, hostname, tinyurl, dispatched) tuples for each URL. + + If a callback was dispatched, only the url and dispatched=True will be set. + + For titles explicitly requested by the user, exclusion_char and excludes + are skipped. + + :param bot: Sopel instance + :param trigger: The trigger object for this event + :param urls: The URLs detected in the triggering message + :param requested: Whether the title was explicitly requested (vs automatic) """ shorten_url_length = bot.config.url.shorten_url_length for url in urls: # Exclude URLs that start with the exclusion char - if url.startswith(bot.config.url.exclusion_char): + if not requested and url.startswith(bot.config.url.exclusion_char): continue + parsed_url = urlparse(url) + # Check the URL does not match an existing URL callback - if check_callbacks(bot, url): - continue + if check_callbacks(bot, url, use_excludes=not requested): + yield (url, None, None, None, True) + return # Prevent private addresses from being queried if enable_private_resolution is False + # FIXME: This does nothing when an attacker knows how to host a 302 + # FIXME: This whole concept has a TOCTOU issue if not bot.config.url.enable_private_resolution: - parsed = urlparse(url) - # Check if it's an address like http://192.168.1.1 try: - if ipaddress.ip_address(parsed.hostname).is_private or ipaddress.ip_address(parsed.hostname).is_loopback: - LOGGER.debug('Ignoring private URL: %s', url) - continue + ips = [ip_address(parsed_url.hostname)] except ValueError: - pass - - # Check if domains are RFC1918 addresses if enable_dns_resolutions is set - if bot.config.url.enable_dns_resolution: - private = False - for result in dns.resolver.query(parsed.hostname): - if ipaddress.ip_address(result).is_private or ipaddress.ip_address(parsed.hostname).is_loopback: - private = True - break - if private: - LOGGER.debug('Ignoring private URL: %s', url) - continue + ips = [ip_address(ip) for ip in dns.resolver.query(parsed_url.hostname)] + + private = False + for ip in ips: + if ip.is_private or ip.is_loopback: + private = True + break + if private: + LOGGER.debug('Ignoring private URL: %s', url) + continue # Call the URL to get a title, if possible title = find_title(url) @@ -373,14 +378,15 @@ def process_urls(bot, trigger, urls): if (shorten_url_length > 0) and (len(url) > shorten_url_length): tinyurl = get_or_create_shorturl(bot, url) - yield (url, title, get_hostname(url), tinyurl) + yield (url, title, parsed_url.hostname, tinyurl, False) -def check_callbacks(bot, url): +def check_callbacks(bot: SopelWrapper, url: str, use_excludes: bool = True) -> bool: """Check if ``url`` is excluded or matches any URL callback patterns. :param bot: Sopel instance - :param str url: URL to check + :param url: URL to check + :param use_excludes: Use or ignore the configured exclusion lists :return: True if ``url`` is excluded or matches any URL callback pattern This function looks at the ``bot.memory`` for ``url_exclude`` patterns and @@ -400,16 +406,21 @@ def check_callbacks(bot, url): """ # Check if it matches the exclusion list first - matched = any(regex.search(url) for regex in bot.memory['url_exclude']) + excluded = False + if use_excludes: + excluded = any(regex.search(url) for regex in bot.memory["url_exclude"]) return ( - matched or + excluded or any(bot.search_url_callbacks(url)) or bot.rules.check_url_callback(bot, url) ) -def find_title(url, verify=True): - """Return the title for the given URL.""" +def find_title(url: str, verify: bool = True) -> Optional[str]: + """Return the title for the given URL. + + :param verify: Whether to require a valid certificate when using https + """ try: response = requests.get(url, stream=True, verify=verify, headers=DEFAULT_HEADERS) @@ -453,26 +464,12 @@ def find_title(url, verify=True): return title or None -def get_hostname(url): - idx = 7 - if url.startswith('https://'): - idx = 8 - elif url.startswith('ftp://'): - idx = 6 - hostname = url[idx:] - slash = hostname.find('/') - if slash != -1: - hostname = hostname[:slash] - return hostname - - -def get_or_create_shorturl(bot, url): +def get_or_create_shorturl(bot: SopelWrapper, url: str) -> str: """Get or create a short URL for ``url`` :param bot: Sopel instance - :param str url: URL to get or create a short URL for + :param url: URL to get or create a short URL for :return: A short URL - :rtype: str It gets the short URL for ``url`` from the bot's memory if it exists. Otherwise, it creates a short URL (see :func:`get_tinyurl`), stores it @@ -488,7 +485,7 @@ def get_or_create_shorturl(bot, url): return tinyurl -def get_tinyurl(url): +def get_tinyurl(url: str) -> Optional[str]: """Returns a shortened tinyURL link of the URL""" base_url = "https://tinyurl.com/api-create.php" tinyurl = "%s?%s" % (base_url, web.urlencode({'url': url}))