
Commit

Merge pull request #988
Elad Alfassa committed Jan 30, 2016
2 parents f07cc17 + ccda12e commit 4cceef6
Showing 1 changed file with 11 additions and 23 deletions.
sopel/modules/url.py: 11 additions, 23 deletions
@@ -8,10 +8,12 @@
 from __future__ import unicode_literals, absolute_import, print_function, division

 import re
+from contextlib import closing
 from sopel import web, tools
 from sopel.module import commands, rule, example
 from sopel.config.types import ValidatedAttribute, ListAttribute, StaticSection

+import requests

 url_finder = None
 # These are used to clean up the title tag before actually parsing it. Not the
@@ -150,14 +152,6 @@ def process_urls(bot, trigger, urls):
                 pass
             # First, check that the URL we got doesn't match
             matched = check_callbacks(bot, trigger, url, False)
             if matched:
                 continue
-            # Then see if it redirects anywhere
-            new_url = follow_redirects(url)
-            if not new_url:
-                continue
-            # Then see if the final URL matches anything
-            matched = check_callbacks(bot, trigger, new_url, new_url != url)
-            if matched:
-                continue
             # Finally, actually show the URL
@@ -167,20 +161,6 @@ def process_urls(bot, trigger, urls):
     return results


-def follow_redirects(url):
-    """
-    Follow HTTP 3xx redirects, and return the actual URL. Return None if
-    there's a problem.
-    """
-    try:
-        connection = web.get_urllib_object(url, 60)
-        url = connection.geturl() or url
-        connection.close()
-    except:
-        return None
-    return url
-
-
 def check_callbacks(bot, trigger, url, run=True):
     """
     Check the given URL against the callbacks list. If it matches, and ``run``
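Note for readers: follow_redirects could be dropped because requests follows HTTP 3xx redirects on its own and exposes the final address on the response object. A minimal sketch of that behavior, using only the public requests API (the URL is a placeholder):

    import requests

    # requests resolves the redirect chain itself (allow_redirects defaults
    # to True for GET), so no manual geturl() step is needed.
    response = requests.get('http://example.com/redirecting-link')
    print(response.url)      # final URL after any 3xx hops
    print(response.history)  # list of intermediate redirect responses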
@@ -201,10 +181,18 @@ def check_callbacks(bot, trigger, url, run=True):

 def find_title(url):
     """Return the title for the given URL."""
+    response = requests.get(url, stream=True)
     try:
-        content, headers = web.get(url, return_headers=True, limit_bytes=max_bytes)
+        content = ''
+        for byte in response.iter_content(chunk_size=512, decode_unicode=True):
+            content += str(byte)
+            if '</title>' in content or len(content) > max_bytes:
+                break
     except UnicodeDecodeError:
         return # Fail silently when data can't be decoded
+    finally:
+        # need to close the connection because we have not read all the data
+        response.close()

     # Some cleanup that I don't really grok, but was in the original, so
     # we'll keep it (with the compiled regexes made global) for now.

Inline review comments on the iter_content(chunk_size=512, decode_unicode=True) line:

@anarcat (Jan 30, 2016): Why hardcode that value here? Shouldn't we rely on requests.CHUNK_SIZE instead?

@elad661 (Contributor, Jan 30, 2016): The default for iter_content is one byte, while iter_lines uses 512 bytes. Using the default that iter_lines uses seemed like a better idea.

@anarcat (Jan 30, 2016): true, good point
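For reference on that discussion: in requests 2.x, Response.iter_content defaults to chunk_size=1 while Response.iter_lines defaults to ITER_CHUNK_SIZE = 512 (defined in requests.models); as far as the requests source goes, there is no top-level requests.CHUNK_SIZE. The title cleanup that follows this hunk is not shown in the diff; below is a hypothetical, simplified version of that step, for illustration only (the module's real compiled regexes differ):

    import re

    def extract_title(content):
        """Pull the text between <title> tags out of partially fetched HTML."""
        # Hypothetical helper, not the module's actual cleanup code.
        match = re.search(r'<title[^>]*>(.*?)</title>', content,
                          re.IGNORECASE | re.DOTALL)
        if match:
            # Collapse internal whitespace, since page titles often span lines.
            return ' '.join(match.group(1).split())
        return None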
