Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

url: fix .title vs url callback plugins #2282

Merged
merged 1 commit into from
Jun 3, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
190 changes: 93 additions & 97 deletions sopel/modules/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
"""
from __future__ import annotations

import ipaddress
from ipaddress import ip_address
import logging
import re
from typing import TYPE_CHECKING
from urllib.parse import urlparse

import dns.resolver
Expand All @@ -23,6 +24,12 @@
from sopel.config import types
from sopel.tools import web

if TYPE_CHECKING:
from typing import Generator, List, Optional, Tuple

from sopel.bot import Sopel, SopelWrapper
from sopel.config import Config
from sopel.trigger import Trigger

LOGGER = logging.getLogger(__name__)
USER_AGENT = (
Expand All @@ -39,8 +46,6 @@
# world's best way to do this, but it'll do for now.
TITLE_TAG_DATA = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
QUOTED_TITLE = re.compile('[\'"]<title>[\'"]', re.IGNORECASE)
# This is another regex that presumably does something important.
RE_DCC = re.compile(r'(?i)dcc\ssend')
# This sets the maximum number of bytes that should be read in order to find
# the title. We don't want it too high, or a link to a big file/stream will
# just keep downloading until there's no more memory. 640k ought to be enough
Expand All @@ -62,22 +67,18 @@ class UrlSection(types.StaticSection):
"""If greater than 0, the title fetcher will include a TinyURL version of links longer than this many characters."""
enable_private_resolution = types.BooleanAttribute(
'enable_private_resolution', default=False)
"""Enable URL lookups for RFC1918 addresses"""
enable_dns_resolution = types.BooleanAttribute(
'enable_dns_resolution', default=False)
"""Enable DNS resolution for all domains to validate if there are RFC1918 resolutions"""
"""Enable requests to private and local network IP addresses"""


def configure(config):
def configure(config: Config):
"""
| name | example | purpose |
| ---- | ------- | ------- |
| enable_auto_title | yes | Enable auto-title. |
| exclude | https?://git\\\\.io/.* | A list of regular expressions for URLs for which the title should not be shown. |
| exclusion\\_char | ! | A character (or string) which, when immediately preceding a URL, will stop the URL's title from being shown. |
| shorten\\_url\\_length | 72 | If greater than 0, the title fetcher will include a TinyURL version of links longer than this many characters. |
| enable\\_private\\_resolution | False | Enable URL lookups for RFC1918 addresses. |
| enable\\_dns\\_resolution | False | Enable DNS resolution for all domains to validate if there are RFC1918 resolutions. |
| enable\\_private\\_resolution | False | Enable requests to private and local network IP addresses. |
"""
config.define_section('url', UrlSection)
config.url.configure_setting(
Expand All @@ -100,15 +101,11 @@ def configure(config):
)
config.url.configure_setting(
'enable_private_resolution',
'Enable URL lookups for RFC1918 addresses?'
)
config.url.configure_setting(
'enable_dns_resolution',
'Enable DNS resolution for all domains to validate if there are RFC1918 resolutions?'
'Enable requests to private and local network IP addresses?'
)


def setup(bot):
def setup(bot: Sopel):
bot.config.define_section('url', UrlSection)

if bot.config.url.exclude:
Expand Down Expand Up @@ -139,7 +136,7 @@ def setup(bot):
bot.memory['shortened_urls'] = tools.SopelMemory()


def shutdown(bot):
def shutdown(bot: Sopel):
# Unset `url_exclude` and `last_seen_url`, but not `shortened_urls`;
# clearing `shortened_urls` will increase API calls. Leaving it in memory
# should not lead to unexpected behavior.
Expand All @@ -154,7 +151,7 @@ def shutdown(bot):
@plugin.example('.urlpexclude example\\.com/\\w+', user_help=True)
@plugin.example('.urlexclude example.com/path', user_help=True)
@plugin.output_prefix('[url] ')
def url_ban(bot, trigger):
def url_ban(bot: SopelWrapper, trigger: Trigger):
"""Exclude a URL from auto title.

Use ``urlpexclude`` to exclude a pattern instead of a URL.
Expand Down Expand Up @@ -199,7 +196,7 @@ def url_ban(bot, trigger):
@plugin.example('.urlpallow example\\.com/\\w+', user_help=True)
@plugin.example('.urlallow example.com/path', user_help=True)
@plugin.output_prefix('[url] ')
def url_unban(bot, trigger):
def url_unban(bot: SopelWrapper, trigger: Trigger):
"""Allow a URL for auto title.

Use ``urlpallow`` to allow a pattern instead of a URL.
Expand Down Expand Up @@ -246,31 +243,28 @@ def url_unban(bot, trigger):
'Google | www.google.com',
online=True, vcr=True)
@plugin.output_prefix('[url] ')
def title_command(bot, trigger):
def title_command(bot: SopelWrapper, trigger: Trigger):
"""
Show the title or URL information for the given URL, or the last URL seen
in this channel.
"""
result_count = 0

if not trigger.group(2):
if trigger.sender not in bot.memory['last_seen_url']:
return
matched = check_callbacks(
bot, bot.memory['last_seen_url'][trigger.sender])
if matched:
return
else:
urls = [bot.memory['last_seen_url'][trigger.sender]]
urls = [bot.memory["last_seen_url"][trigger.sender]]
else:
urls = list( # needs to be a list so len() can be checked later
web.search_urls(
trigger,
exclusion_char=bot.config.url.exclusion_char
)
)
# needs to be a list so len() can be checked later
urls = list(web.search_urls(trigger))

result_count = 0
for url, title, domain, tinyurl in process_urls(bot, trigger, urls):
message = '%s | %s' % (title, domain)
for url, title, domain, tinyurl, dispatched in process_urls(
bot, trigger, urls, requested=True
):
if dispatched:
result_count += 1
continue
message = "%s | %s" % (title, domain)
if tinyurl:
message += ' ( %s )' % tinyurl
bot.reply(message)
Expand All @@ -289,7 +283,7 @@ def title_command(bot, trigger):

@plugin.rule(r'(?u).*(https?://\S+).*')
@plugin.output_prefix('[url] ')
def title_auto(bot, trigger):
def title_auto(bot: SopelWrapper, trigger: Trigger):
"""
Automatically show titles for URLs. For shortened URLs/redirects, find
where the URL redirects to and show the title for that (or call a function
Expand All @@ -311,55 +305,68 @@ def title_auto(bot, trigger):
urls = web.search_urls(
trigger, exclusion_char=bot.config.url.exclusion_char, clean=True)

for url, title, domain, tinyurl in process_urls(bot, trigger, urls):
message = '%s | %s' % (title, domain)
if tinyurl:
message += ' ( %s )' % tinyurl
# Guard against responding to other instances of this bot.
if message != trigger:
bot.say(message)
bot.memory['last_seen_url'][trigger.sender] = url
for url, title, domain, tinyurl, dispatched in process_urls(bot, trigger, urls):
if not dispatched:
message = '%s | %s' % (title, domain)
if tinyurl:
message += ' ( %s )' % tinyurl
# Guard against responding to other instances of this bot.
if message != trigger:
bot.say(message)
bot.memory["last_seen_url"][trigger.sender] = url


def process_urls(bot, trigger, urls):
def process_urls(
bot: SopelWrapper, trigger: Trigger, urls: List[str], requested: bool = False
) -> Generator[Tuple[str, str, Optional[str], Optional[str], bool], None, None]:
"""
For each URL in the list, ensure that it isn't handled by another plugin.
If not, find where it redirects to, if anywhere. If that redirected URL
should be handled by another plugin, dispatch the callback for it.
Return a list of (title, hostname) tuples for each URL which is not handled
by another plugin.
For each URL in the list, ensure it should be titled, and do so.

:param bot: Sopel instance
:param trigger: The trigger object for this event
:param urls: The URLs detected in the triggering message
:param requested: Whether the title was explicitly requested (vs automatic)

See if it's handled by another plugin. If not, find where it redirects to,
if anywhere. If that redirected URL should be handled by another plugin,
dispatch the callback for it. Return a list of
(url, title, hostname, tinyurl, dispatched) tuples for each URL.

If a callback was dispatched, only the url and dispatched=True will be set.

For titles explicitly requested by the user, exclusion_char and excludes
are skipped.
"""
shorten_url_length = bot.config.url.shorten_url_length
for url in urls:
# Exclude URLs that start with the exclusion char
if url.startswith(bot.config.url.exclusion_char):
if not requested and url.startswith(bot.config.url.exclusion_char):
continue

parsed_url = urlparse(url)

# Check the URL does not match an existing URL callback
if check_callbacks(bot, url):
continue
if check_callbacks(bot, url, use_excludes=not requested):
yield (url, None, None, None, True)
return

# Prevent private addresses from being queried if enable_private_resolution is False
# FIXME: This does nothing when an attacker knows how to host a 302
# FIXME: This whole concept has a TOCTOU issue
if not bot.config.url.enable_private_resolution:
parsed = urlparse(url)
# Check if it's an address like http://192.168.1.1
try:
if ipaddress.ip_address(parsed.hostname).is_private or ipaddress.ip_address(parsed.hostname).is_loopback:
LOGGER.debug('Ignoring private URL: %s', url)
continue
ips = [ip_address(parsed_url.hostname)]
except ValueError:
pass

# Check if domains are RFC1918 addresses if enable_dns_resolutions is set
if bot.config.url.enable_dns_resolution:
private = False
for result in dns.resolver.query(parsed.hostname):
if ipaddress.ip_address(result).is_private or ipaddress.ip_address(parsed.hostname).is_loopback:
private = True
break
if private:
LOGGER.debug('Ignoring private URL: %s', url)
continue
ips = [ip_address(ip) for ip in dns.resolver.query(parsed_url.hostname)]

private = False
for ip in ips:
if ip.is_private or ip.is_loopback:
private = True
break
if private:
LOGGER.debug("Ignoring private URL: %s", url)
continue

# Call the URL to get a title, if possible
title = find_title(url)
Expand All @@ -373,14 +380,15 @@ def process_urls(bot, trigger, urls):
if (shorten_url_length > 0) and (len(url) > shorten_url_length):
tinyurl = get_or_create_shorturl(bot, url)

yield (url, title, get_hostname(url), tinyurl)
yield (url, title, parsed_url.hostname, tinyurl, False)


def check_callbacks(bot, url):
def check_callbacks(bot: SopelWrapper, url: str, use_excludes: bool = True) -> bool:
"""Check if ``url`` is excluded or matches any URL callback patterns.

:param bot: Sopel instance
:param str url: URL to check
:param url: URL to check
:param use_excludes: Use or ignore the configured exclusion lists
:return: True if ``url`` is excluded or matches any URL callback pattern

This function looks at the ``bot.memory`` for ``url_exclude`` patterns and
Expand All @@ -400,16 +408,21 @@ def check_callbacks(bot, url):

"""
# Check if it matches the exclusion list first
matched = any(regex.search(url) for regex in bot.memory['url_exclude'])
excluded = use_excludes and any(
regex.search(url) for regex in bot.memory["url_exclude"]
)
return (
matched or
excluded or
any(bot.search_url_callbacks(url)) or
bot.rules.check_url_callback(bot, url)
)


def find_title(url, verify=True):
"""Return the title for the given URL."""
def find_title(url: str, verify: bool = True) -> Optional[str]:
"""Return the title for the given URL.

:param verify: Whether to require a valid certificate when using https
"""
try:
response = requests.get(url, stream=True, verify=verify,
headers=DEFAULT_HEADERS)
Expand Down Expand Up @@ -447,32 +460,15 @@ def find_title(url, verify=True):

title = ' '.join(title.split()) # cleanly remove multiple spaces

# More cryptic regex substitutions. This one looks to be myano's invention.
title = RE_DCC.sub('', title)

return title or None


def get_hostname(url):
    """Return the network location (host, plus port if any) of ``url``.

    :param str url: an absolute URL such as ``https://example.com/path``
    :return: the part of the URL between the scheme separator and the
             start of the path, query string, or fragment; an empty
             string if ``url`` has no ``scheme://`` prefix
    :rtype: str

    The URL is parsed with :func:`urllib.parse.urlparse` rather than by
    slicing off a fixed-length prefix, so the scheme may be of any length
    or letter case, and a query string or fragment that directly follows
    the host (``http://example.com?x=1``) is not included in the result.
    """
    # ``netloc`` (not ``hostname``) keeps an explicit port and the original
    # letter case, matching what callers have always received for ordinary
    # http/https/ftp URLs.
    return urlparse(url).netloc


def get_or_create_shorturl(bot, url):
def get_or_create_shorturl(bot: SopelWrapper, url: str) -> str:
"""Get or create a short URL for ``url``

:param bot: Sopel instance
:param str url: URL to get or create a short URL for
:param url: URL to get or create a short URL for
:return: A short URL
:rtype: str

It gets the short URL for ``url`` from the bot's memory if it exists.
Otherwise, it creates a short URL (see :func:`get_tinyurl`), stores it
Expand All @@ -488,7 +484,7 @@ def get_or_create_shorturl(bot, url):
return tinyurl


def get_tinyurl(url):
def get_tinyurl(url: str) -> Optional[str]:
"""Returns a shortened tinyURL link of the URL"""
base_url = "https://tinyurl.com/api-create.php"
tinyurl = "%s?%s" % (base_url, web.urlencode({'url': url}))
Expand Down