From a3fdb72a55e1cb4395f0f9eae1624d281de6d115 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 16 May 2022 02:46:01 -0400 Subject: [PATCH] url: fix .title vs url callback plugins Also a bunch of misc cleaning --- sopel/modules/url.py | 190 +++++++++++++++++++++---------------------- 1 file changed, 93 insertions(+), 97 deletions(-) diff --git a/sopel/modules/url.py b/sopel/modules/url.py index 69a43e71c2..6147d3ba44 100644 --- a/sopel/modules/url.py +++ b/sopel/modules/url.py @@ -10,9 +10,10 @@ """ from __future__ import annotations -import ipaddress +from ipaddress import ip_address import logging import re +from typing import TYPE_CHECKING from urllib.parse import urlparse import dns.resolver @@ -23,6 +24,12 @@ from sopel.config import types from sopel.tools import web +if TYPE_CHECKING: + from typing import Generator, List, Optional, Tuple + + from sopel.bot import Sopel, SopelWrapper + from sopel.config import Config + from sopel.trigger import Trigger LOGGER = logging.getLogger(__name__) USER_AGENT = ( @@ -39,8 +46,6 @@ # world's best way to do this, but it'll do for now. TITLE_TAG_DATA = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE) QUOTED_TITLE = re.compile('[\'"][\'"]', re.IGNORECASE) -# This is another regex that presumably does something important. -RE_DCC = re.compile(r'(?i)dcc\ssend') # This sets the maximum number of bytes that should be read in order to find # the title. We don't want it too high, or a link to a big file/stream will # just keep downloading until there's no more memory. 640k ought to be enough @@ -62,13 +67,10 @@ class UrlSection(types.StaticSection): """If greater than 0, the title fetcher will include a TinyURL version of links longer than this many characters.""" enable_private_resolution = types.BooleanAttribute( 'enable_private_resolution', default=False) - """Enable URL lookups for RFC1918 addresses""" - enable_dns_resolution = types.BooleanAttribute( - 'enable_dns_resolution', default=False) - """Enable DNS resolution for all domains to validate if there are RFC1918 resolutions""" + """Enable requests to private and local network IP addresses""" -def configure(config): +def configure(config: Config): """ | name | example | purpose | | ---- | ------- | ------- | @@ -76,8 +78,7 @@ def configure(config): | exclude | https?://git\\\\.io/.* | A list of regular expressions for URLs for which the title should not be shown. | | exclusion\\_char | ! | A character (or string) which, when immediately preceding a URL, will stop the URL's title from being shown. | | shorten\\_url\\_length | 72 | If greater than 0, the title fetcher will include a TinyURL version of links longer than this many characters. | - | enable\\_private\\_resolution | False | Enable URL lookups for RFC1918 addresses. | - | enable\\_dns\\_resolution | False | Enable DNS resolution for all domains to validate if there are RFC1918 resolutions. | + | enable\\_private\\_resolution | False | Enable requests to private and local network IP addresses. | """ config.define_section('url', UrlSection) config.url.configure_setting( @@ -100,15 +101,11 @@ def configure(config): ) config.url.configure_setting( 'enable_private_resolution', - 'Enable URL lookups for RFC1918 addresses?' - ) - config.url.configure_setting( - 'enable_dns_resolution', - 'Enable DNS resolution for all domains to validate if there are RFC1918 resolutions?' + 'Enable requests to private and local network IP addresses?' ) -def setup(bot): +def setup(bot: Sopel): bot.config.define_section('url', UrlSection) if bot.config.url.exclude: @@ -139,7 +136,7 @@ def setup(bot): bot.memory['shortened_urls'] = tools.SopelMemory() -def shutdown(bot): +def shutdown(bot: Sopel): # Unset `url_exclude` and `last_seen_url`, but not `shortened_urls`; # clearing `shortened_urls` will increase API calls. Leaving it in memory # should not lead to unexpected behavior. @@ -154,7 +151,7 @@ def shutdown(bot): @plugin.example('.urlpexclude example\\.com/\\w+', user_help=True) @plugin.example('.urlexclude example.com/path', user_help=True) @plugin.output_prefix('[url] ') -def url_ban(bot, trigger): +def url_ban(bot: SopelWrapper, trigger: Trigger): """Exclude a URL from auto title. Use ``urlpexclude`` to exclude a pattern instead of a URL. @@ -199,7 +196,7 @@ def url_ban(bot, trigger): @plugin.example('.urlpallow example\\.com/\\w+', user_help=True) @plugin.example('.urlallow example.com/path', user_help=True) @plugin.output_prefix('[url] ') -def url_unban(bot, trigger): +def url_unban(bot: SopelWrapper, trigger: Trigger): """Allow a URL for auto title. Use ``urlpallow`` to allow a pattern instead of a URL. @@ -246,31 +243,28 @@ def url_unban(bot, trigger): 'Google | www.google.com', online=True, vcr=True) @plugin.output_prefix('[url] ') -def title_command(bot, trigger): +def title_command(bot: SopelWrapper, trigger: Trigger): """ Show the title or URL information for the given URL, or the last URL seen in this channel. """ + result_count = 0 + if not trigger.group(2): if trigger.sender not in bot.memory['last_seen_url']: return - matched = check_callbacks( - bot, bot.memory['last_seen_url'][trigger.sender]) - if matched: - return - else: - urls = [bot.memory['last_seen_url'][trigger.sender]] + urls = [bot.memory["last_seen_url"][trigger.sender]] else: - urls = list( # needs to be a list so len() can be checked later - web.search_urls( - trigger, - exclusion_char=bot.config.url.exclusion_char - ) - ) + # needs to be a list so len() can be checked later + urls = list(web.search_urls(trigger)) - result_count = 0 - for url, title, domain, tinyurl in process_urls(bot, trigger, urls): - message = '%s | %s' % (title, domain) + for url, title, domain, tinyurl, dispatched in process_urls( + bot, trigger, urls, requested=True + ): + if dispatched: + result_count += 1 + continue + message = "%s | %s" % (title, domain) if tinyurl: message += ' ( %s )' % tinyurl bot.reply(message) @@ -289,7 +283,7 @@ def title_command(bot, trigger): @plugin.rule(r'(?u).*(https?://\S+).*') @plugin.output_prefix('[url] ') -def title_auto(bot, trigger): +def title_auto(bot: SopelWrapper, trigger: Trigger): """ Automatically show titles for URLs. For shortened URLs/redirects, find where the URL redirects to and show the title for that (or call a function @@ -311,55 +305,68 @@ def title_auto(bot, trigger): urls = web.search_urls( trigger, exclusion_char=bot.config.url.exclusion_char, clean=True) - for url, title, domain, tinyurl in process_urls(bot, trigger, urls): - message = '%s | %s' % (title, domain) - if tinyurl: - message += ' ( %s )' % tinyurl - # Guard against responding to other instances of this bot. - if message != trigger: - bot.say(message) - bot.memory['last_seen_url'][trigger.sender] = url + for url, title, domain, tinyurl, dispatched in process_urls(bot, trigger, urls): + if not dispatched: + message = '%s | %s' % (title, domain) + if tinyurl: + message += ' ( %s )' % tinyurl + # Guard against responding to other instances of this bot. + if message != trigger: + bot.say(message) + bot.memory["last_seen_url"][trigger.sender] = url -def process_urls(bot, trigger, urls): +def process_urls( + bot: SopelWrapper, trigger: Trigger, urls: List[str], requested: bool = False +) -> Generator[Tuple[str, str, Optional[str], Optional[str], bool], None, None]: """ - For each URL in the list, ensure that it isn't handled by another plugin. - If not, find where it redirects to, if anywhere. If that redirected URL - should be handled by another plugin, dispatch the callback for it. - Return a list of (title, hostname) tuples for each URL which is not handled - by another plugin. + For each URL in the list, ensure it should be titled, and do so. + + :param bot: Sopel instance + :param trigger: The trigger object for this event + :param urls: The URLs detected in the triggering message + :param requested: Whether the title was explicitly requested (vs automatic) + + See if it's handled by another plugin. If not, find where it redirects to, + if anywhere. If that redirected URL should be handled by another plugin, + dispatch the callback for it. Return a list of + (url, title, hostname, tinyurl, dispatched) tuples for each URL. + + If a callback was dispatched, only the url and dispatched=True will be set. + + For titles explicitly requested by the user, exclusion_char and excludes + are skipped. """ shorten_url_length = bot.config.url.shorten_url_length for url in urls: # Exclude URLs that start with the exclusion char - if url.startswith(bot.config.url.exclusion_char): + if not requested and url.startswith(bot.config.url.exclusion_char): continue + parsed_url = urlparse(url) + # Check the URL does not match an existing URL callback - if check_callbacks(bot, url): - continue + if check_callbacks(bot, url, use_excludes=not requested): + yield (url, None, None, None, True) + return # Prevent private addresses from being queried if enable_private_resolution is False + # FIXME: This does nothing when an attacker knows how to host a 302 + # FIXME: This whole concept has a TOCTOU issue if not bot.config.url.enable_private_resolution: - parsed = urlparse(url) - # Check if it's an address like http://192.168.1.1 try: - if ipaddress.ip_address(parsed.hostname).is_private or ipaddress.ip_address(parsed.hostname).is_loopback: - LOGGER.debug('Ignoring private URL: %s', url) - continue + ips = [ip_address(parsed_url.hostname)] except ValueError: - pass - - # Check if domains are RFC1918 addresses if enable_dns_resolutions is set - if bot.config.url.enable_dns_resolution: - private = False - for result in dns.resolver.query(parsed.hostname): - if ipaddress.ip_address(result).is_private or ipaddress.ip_address(parsed.hostname).is_loopback: - private = True - break - if private: - LOGGER.debug('Ignoring private URL: %s', url) - continue + ips = [ip_address(ip) for ip in dns.resolver.query(parsed_url.hostname)] + + private = False + for ip in ips: + if ip.is_private or ip.is_loopback: + private = True + break + if private: + LOGGER.debug("Ignoring private URL: %s", url) + continue # Call the URL to get a title, if possible title = find_title(url) @@ -373,14 +380,15 @@ def process_urls(bot, trigger, urls): if (shorten_url_length > 0) and (len(url) > shorten_url_length): tinyurl = get_or_create_shorturl(bot, url) - yield (url, title, get_hostname(url), tinyurl) + yield (url, title, parsed_url.hostname, tinyurl, False) -def check_callbacks(bot, url): +def check_callbacks(bot: SopelWrapper, url: str, use_excludes: bool = True) -> bool: """Check if ``url`` is excluded or matches any URL callback patterns. :param bot: Sopel instance - :param str url: URL to check + :param url: URL to check + :param use_excludes: Use or ignore the configured exclusion lists :return: True if ``url`` is excluded or matches any URL callback pattern This function looks at the ``bot.memory`` for ``url_exclude`` patterns and @@ -400,16 +408,21 @@ def check_callbacks(bot, url): """ # Check if it matches the exclusion list first - matched = any(regex.search(url) for regex in bot.memory['url_exclude']) + excluded = use_excludes and any( + regex.search(url) for regex in bot.memory["url_exclude"] + ) return ( - matched or + excluded or any(bot.search_url_callbacks(url)) or bot.rules.check_url_callback(bot, url) ) -def find_title(url, verify=True): - """Return the title for the given URL.""" +def find_title(url: str, verify: bool = True) -> Optional[str]: + """Return the title for the given URL. + + :param verify: Whether to require a valid certificate when using https + """ try: response = requests.get(url, stream=True, verify=verify, headers=DEFAULT_HEADERS) @@ -447,32 +460,15 @@ def find_title(url, verify=True): title = ' '.join(title.split()) # cleanly remove multiple spaces - # More cryptic regex substitutions. This one looks to be myano's invention. - title = RE_DCC.sub('', title) - return title or None -def get_hostname(url): - idx = 7 - if url.startswith('https://'): - idx = 8 - elif url.startswith('ftp://'): - idx = 6 - hostname = url[idx:] - slash = hostname.find('/') - if slash != -1: - hostname = hostname[:slash] - return hostname - - -def get_or_create_shorturl(bot, url): +def get_or_create_shorturl(bot: SopelWrapper, url: str) -> str: """Get or create a short URL for ``url`` :param bot: Sopel instance - :param str url: URL to get or create a short URL for + :param url: URL to get or create a short URL for :return: A short URL - :rtype: str It gets the short URL for ``url`` from the bot's memory if it exists. Otherwise, it creates a short URL (see :func:`get_tinyurl`), stores it @@ -488,7 +484,7 @@ def get_or_create_shorturl(bot, url): return tinyurl -def get_tinyurl(url): +def get_tinyurl(url: str) -> Optional[str]: """Returns a shortened tinyURL link of the URL""" base_url = "https://tinyurl.com/api-create.php" tinyurl = "%s?%s" % (base_url, web.urlencode({'url': url}))