diff --git a/willie/modules/url.py b/willie/modules/url.py
index aaa683dd1f..748c9fc35b 100644
--- a/willie/modules/url.py
+++ b/willie/modules/url.py
@@ -1,7 +1,7 @@
"""
url.py - Willie URL title module
Copyright 2010-2011, Michael Yanovich, yanovich.net, Kenneth Sham
-Copyright 2012 Edward Powell
+Copyright 2012-2013 Edward Powell
Licensed under the Eiffel Forum License 2.
http://willie.dftba.net
@@ -10,17 +10,24 @@
import re
from htmlentitydefs import name2codepoint
import willie.web as web
+import urllib2
import unicodedata
import urlparse
url_finder = None
r_entity = re.compile(r'&[A-Za-z0-9#]+;')
-INVALID_WEBSITE = 0x01
exclusion_char = '!'
+# These are used to clean up the title tag before actually parsing it. Not the
+# world's best way to do this, but it'll do for now.
+title_tag_data = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
+quoted_title = re.compile('[\'"]<title>[\'"]', re.IGNORECASE)
+# This regex matches 'DCC SEND' so it can be stripped out of titles; echoing
+# that string back to a channel can trip up some IRC clients and NAT routers.
+re_dcc = re.compile(r'(?i)dcc\ssend')
+
def configure(config):
"""
-
+
| [url] | example | purpose |
| ---- | ------- | ------- |
| exclude | https?://git\.io/.* | A list of regular expressions for URLs for which the title should not be shown. |
+| exclusion_char | ! | A character (or string) which, when immediately preceding a URL, will stop the URL's title from being shown. |
@@ -29,11 +36,12 @@ def configure(config):
if config.option('Exclude certain URLs from automatic title display', False):
if not config.has_section('url'):
config.add_section('url')
- config.add_list('url', 'exclude', 'Enter regular expressions for each URL you would like to exclude.',
+ config.add_list('url', 'exclude', 'Enter regular expressions for each URL you would like to exclude.',
'Regex:')
config.interactive_add('url', 'exclusion_char',
'Prefix to suppress URL titling', '!')
-
+
+
def setup(willie):
global url_finder, exclusion_char
if willie.config.has_option('url', 'exclude'):
@@ -41,157 +49,191 @@ def setup(willie):
willie.config.url.get_list('exclude')]
else:
regexes = []
-
+
+ # We're keeping these in their own list, rather than putting them in the
+ # callbacks list, because (1) it's easier to deal with modules that still
+ # use this list instead of the newer callbacks list, and (2) having a
+ # lambda just to pass is kind of ugly.
if not willie.memory.contains('url_exclude'):
willie.memory['url_exclude'] = regexes
else:
exclude = willie.memory['url_exclude']
- if regexes: exclude.append(regexes)
- willie.memory['url_exclude'] = exclude
-
+ if regexes:
+ exclude.extend(regexes)
+ willie.memory['url_exclude'] = exclude
+
+ # Ensure that url_callbacks and last_seen_url are in memory
+ if not willie.memory.contains('url_callbacks'):
+ willie.memory['url_callbacks'] = {}
+ if not willie.memory.contains('last_seen_url'):
+ willie.memory['last_seen_url'] = {}
+
if willie.config.has_option('url', 'exclusion_char'):
exclusion_char = willie.config.url.exclusion_char
-
- url_finder = re.compile(r'(?u)(%s?(http|https|ftp)(://\S+))' %
+
+ url_finder = re.compile(r'(?u)(%s?(?:http|https|ftp)(?:://\S+))' %
(exclusion_char))
- # We want the exclusion list to be pre-compiled, since url parsing gets
- # called a /lot/, and it's annoying when it's laggy.
-def find_title(url):
+
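+# A hypothetical sketch of how another module can claim URLs through the
+# url_callbacks registry that setup() fills in above; the regex and handler
+# name here are illustrative, not part of this module:
+#
+#     def setup(willie):
+#         if not willie.memory.contains('url_callbacks'):
+#             willie.memory['url_callbacks'] = {}
+#         regex = re.compile(r'https?://example\.com/\S+')
+#         willie.memory['url_callbacks'][regex] = my_handler
+#
+# check_callbacks() below will then call my_handler(willie, trigger, match)
+# for matching URLs instead of showing a generic title.
+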
+def title_command(willie, trigger):
+ """
+ Show the title or URL information for the given URL, or the last URL seen
+ in this channel.
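+ For example, ".title https://github.com" (assuming willie's default "."
+ prefix), or just ".title" to look up the last URL posted in the channel.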
+ """
+ if not trigger.group(2):
+ if trigger.sender not in willie.memory['last_seen_url']:
+ return
+ matched = check_callbacks(willie, trigger,
+ willie.memory['last_seen_url'][trigger.sender],
+ True)
+ if not matched:
+ urls = [willie.memory['last_seen_url'][trigger.sender]]
+ else:
+ urls = re.findall(url_finder, trigger)
+
+ results = process_urls(willie, trigger, urls)
+ for title, domain in results[:4]:
+ willie.say('[ %s ] - %s' % (title, domain))
+title_command.commands = ['title']
+
+
+def title_auto(willie, trigger):
+ """
+ Automatically show titles for URLs. For shortened URLs/redirects, find
+ where the URL redirects to and show the title for that (or call a function
+ from another module to give more information).
+ """
+ if re.match(willie.config.core.prefix + 'title', trigger):
+ return
+
+ urls = re.findall(url_finder, trigger)
+ results = process_urls(willie, trigger, urls)
+ willie.memory['last_seen_url'][trigger.sender] = urls[-1]
+
+ for result in results[:4]:
+ message = '[ %s ] - %s' % tuple(result)
+ if message != trigger:
+ willie.say(message)
+title_auto.rule = '(?u).*(https?://\S+).*'
+
+
+def process_urls(willie, trigger, urls):
+ """
+ For each URL in the list, ensure that it isn't handled by another module.
+ If not, find where it redirects to, if anywhere. If that redirected URL
+ should be handled by another module, dispatch the callback for it.
+ Return a list of (title, TLD) tuples for each URL which is not handled by
+ another module.
+ """
+
+ results = []
+ for url in urls:
+ if not url.startswith(exclusion_char):
+ # Magic stuff to account for international domain names
+ url = uni_encode(url)
+ url = uni_decode(url)
+ url = iri_to_uri(url)
+ # First, check that the URL we got isn't excluded and doesn't already
+ # match a callback (callbacks are only dispatched for redirected URLs)
+ matched = check_callbacks(willie, trigger, url, False)
+ if matched:
+ continue
+ # Then see if it redirects anywhere
+ new_url = follow_redirects(url)
+ if not new_url:
+ continue
+ # Then see if the final URL matches anything
+ matched = check_callbacks(willie, trigger, new_url, new_url != url)
+ if matched:
+ continue
+ # Finally, actually show the URL
+ title = find_title(url)
+ if title:
+ results.append((title, getTLD(url)))
+ return results
+
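+# For illustration, process_urls(willie, trigger, ['http://example.com/'])
+# would return [('Example Domain', 'example.com')], assuming no exclusion,
+# redirect, or callback match gets in the way.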
+
+def follow_redirects(url):
"""
- This finds the title when provided with a string of a URL.
+ Follow HTTP 3xx redirects, and return the actual URL. Return None if
+ there's a problem.
"""
- uri = url
+ try:
+ connection = urllib2.urlopen(url)
+ url = connection.geturl() or url
+ connection.close()
+ except Exception:
+ return None
+ return url
+
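+# urllib2 follows 3xx responses by itself, and geturl() reports the final
+# address, so e.g. follow_redirects('http://git.io/...') would hand back
+# the expanded github.com URL (or None if the request fails).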
- if not uri and hasattr(self, 'last_seen_uri'):
- uri = self.last_seen_uri.get(origin.sender)
+def check_callbacks(willie, trigger, url, run=True):
+ """
+ Check the given URL against the exclusion and callbacks lists. If it
+ matches a callback and ``run`` is ``True``, run the callback function;
+ otherwise just record the match. Return ``True`` if the URL matched
+ either list.
+ """
+ # Check if it matches the exclusion list first
+ matched = any(regex.search(url) for regex in willie.memory['url_exclude'])
+ # Then, check if there's anything in the callback list
+ for regex, function in willie.memory['url_callbacks'].iteritems():
+ match = regex.search(url)
+ if match:
+ if run:
+ function(willie, trigger, match)
+ matched = True
+ return matched
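+# With the hypothetical registration sketched after setup() above, for
+# example, check_callbacks(willie, trigger, 'http://example.com/x') would
+# call my_handler(willie, trigger, match) and return True.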
- if not re.search('^((https?)|(ftp))://', uri):
- uri = 'http://' + uri
- if "twitter.com" in uri:
- uri = uri.replace('#!', '?_escaped_fragment_=')
+def find_title(url):
+ """Return the title for the given URL."""
+ content = web.get(url)
+ # Clean up the title tag before parsing: normalize <title> tags that
+ # carry attributes, and strip quoted '<title>' strings so they can't
+ # confuse the find() calls below.
+ content = title_tag_data.sub(r'<\1title>', content)
+ content = quoted_title.sub('', content)
- content = web.get(uri)
- regex = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
- content = regex.sub(r'<\1title>',content)
- regex = re.compile('[\'"][\'"]', re.IGNORECASE)
- content = regex.sub('',content)
start = content.find('<title>')
- if start == -1: return
- end = content.find('</title>', start)
- if end == -1: return
- content = content[start+7:end]
- content = content.strip('\n').rstrip().lstrip()
- title = content
-
- if len(title) > 200:
- title = title[:200] + '[...]'
-
- def e(m):
- entity = m.group()
+ end = content.find('</title>')
+ if start == -1 or end == -1:
+ return
+ title = content[start + 7:end]
+ title = title.strip()[:200]
+
+ def get_unicode_entity(match):
+ entity = match.group()
if entity.startswith('&#x'):
- cp = int(entity[3:-1],16)
- return unichr(cp).encode('utf-8')
+ cp = int(entity[3:-1], 16)
elif entity.startswith('&#'):
cp = int(entity[2:-1])
- return unichr(cp).encode('utf-8')
else:
- char = name2codepoint[entity[1:-1]]
- return unichr(char).encode('utf-8')
-
- title = r_entity.sub(e, title)
-
- if title:
- title = uni_decode(title)
- else: title = 'None'
+ cp = name2codepoint[entity[1:-1]]
+ return unichr(cp)
- title = title.replace('\n', '')
- title = title.replace('\r', '')
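+ # e.g. this turns '&amp;' into '&', and '&#34;' or '&#x22;' into '"'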
+ title = r_entity.sub(get_unicode_entity, title)
+ title = uni_decode(title)
- def remove_spaces(x):
- if " " in x:
- x = x.replace(" ", " ")
- return remove_spaces(x)
- else:
- return x
+ title = ' '.join(title.split()) # collapse runs of whitespace (including newlines)
- title = remove_spaces (title)
+ # Strip 'DCC SEND' from the title (see re_dcc above); this looks to be
+ # myano's invention.
+ title = re_dcc.sub('', title)
- re_dcc = re.compile(r'(?i)dcc\ssend')
- title = re.sub(re_dcc, '', title)
+ return title or None
- if title:
- return title
-def getTLD (url):
+def getTLD(url):
idx = 7
- if url.startswith('https://'): idx = 8
- elif url.startswith('ftp://'): idx = 6
- u = url[idx:]
- f = u.find('/')
- if f == -1: u = url
- else: u = url[0:idx] + u[0:f]
- return u
-
-def get_results(willie, text):
- a = re.findall(url_finder, text)
- display = [ ]
- for match in a:
- match = match[0]
- if (match.startswith(exclusion_char) or
- any(pattern.findall(match) for pattern in willie.memory['url_exclude'])):
- continue
- url = uni_encode(match)
- url = uni_decode(url)
- url = iriToUri(url)
- try:
- page_title = find_title(url)
- except:
- page_title = None # if it can't access the site fail silently
- display.append([page_title, url])
- return display
-
-def show_title_auto (willie, trigger):
- if trigger.startswith('.title '):
- return
- if len(re.findall("\([\d]+\sfiles\sin\s[\d]+\sdirs\)", trigger)) == 1: return
- try:
- results = get_results(willie, trigger)
- except Exception as e: raise e
- if results is None: return
-
- k = 1
- for r in results:
- if k > 3: break
- k += 1
-
- if r[0] is None:
- continue
- else: r[1] = getTLD(r[1])
- message = '[ %s ] - %s' % (r[0], r[1])
- if message != trigger:
- willie.say(message)
-show_title_auto.rule = '(?u).*((http|https)(://\S+)).*'
-show_title_auto.priority = 'high'
+ if url.startswith('https://'):
+ idx = 8
+ elif url.startswith('ftp://'):
+ idx = 6
+ tld = url[idx:]
+ slash = tld.find('/')
+ if slash != -1:
+ tld = tld[:slash]
+ return tld
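+# Note that despite its name, getTLD() returns the host part of the URL,
+# e.g. getTLD('https://example.com/some/page') == 'example.com'.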
-def show_title_demand (willie, trigger):
- """Show the title of a URL"""
- #try:
- results = get_results(trigger)
- #except: return
- if results is None: return
- for r in results:
- if r[0] is None: continue
- r[1] = getTLD(r[1])
- willie.say('[ %s ] - %s' % (r[0], r[1]))
-show_title_demand.commands = ['title']
-show_title_demand.priority = 'high'
+# Functions for international domain name magic
-#Tools formerly in unicode.py
-
def uni_decode(bytes):
try:
text = bytes.decode('utf-8')
@@ -218,12 +260,9 @@ def urlEncodeNonAscii(b):
return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), b)
-def iriToUri(iri):
+def iri_to_uri(iri):
parts = urlparse.urlparse(iri)
return urlparse.urlunparse(
part.encode('idna') if parti == 1 else urlEncodeNonAscii(part.encode('utf-8'))
for parti, part in enumerate(parts)
)
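+
+# For example, iri_to_uri(u'http://b\xfccher.example/caf\xe9') would yield
+# 'http://xn--bcher-kva.example/caf%c3%a9': the netloc is IDNA-encoded and
+# the remaining parts are UTF-8 percent-encoded by urlEncodeNonAscii().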
-
-if __name__ == '__main__':
- print __doc__.strip()