Skip to content

Commit

Permalink
Fix issue grapesmoker#2 by skipping all content until we find a number. Also specify the parser in BeautifulSoup to remove a warning that occurs when running the parser.
Browse files Browse the repository at this point in the history
  • Loading branch information
alopezlago committed Jul 22, 2016
1 parent ee39d85 commit 2f9ddd1
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 2 deletions.
12 changes: 11 additions & 1 deletion Packet.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import subprocess
import codecs

from utils import ansregex, bpart_regex, is_answer, is_bpart, get_bonus_part_value, sanitize, conf_gen
from utils import ansregex, bpart_regex, is_answer, is_bpart, get_bonus_part_value, sanitize, conf_gen, starts_with_number

class InvalidPacket(Exception):

Expand Down Expand Up @@ -132,6 +132,16 @@ def prepare_html_file(self, html_file, skip_lines=0):

packet_contents = map(lambda x: sanitize(x, valid_tags=['em', 'strong']),
packet_contents.split('\n'))

# Skip all of the packet information at the beginning
first_line_with_number_index = 0
for line in packet_contents:
if starts_with_number(line):
break
first_line_with_number_index += 1

packet_contents = packet_contents[first_line_with_number_index:]

packet_contents = [x.strip() for x in packet_contents if sanitize(x).strip() != ''
and len(x) > 20
and (not re.search('Tossups', x, flags=re.I))
Expand Down
7 changes: 6 additions & 1 deletion utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Case-insensitive match for an answer label such as "ANSWER:" or "Answers:"
# (the two wildcard chars tolerate OCR/typo noise like "asnwer:").
ansregex = r'(?i)a..?wers?:'
# A bonus-part point marker, e.g. "[10]", at the start of a line.
bpart_regex = r'^[\s]*\[\d+\]'
# Bracket/paren characters stripped when extracting a bonus part's value.
bonus_value_regex = r'\[|\]|\(|\)'
# A leading question number like "1." — optionally wrapped in <strong> tags.
num_regex = r'^([\s]*<strong>[\d]+\.[\s]*<\/strong>[\s]*|[\s]*[\d]+\.[\s]*)'

def is_answer(line):

Expand All @@ -16,13 +17,17 @@ def is_bpart(line):

return re.search(bpart_regex, line) is not None

def starts_with_number(line):
    """Return True if `line` begins with a question number.

    A question number is something like "1." at the start of the line,
    optionally wrapped in <strong> tags (see num_regex).
    """
    return bool(re.match(num_regex, line))

def get_bonus_part_value(line):
    """Extract the point value from a bonus-part marker, e.g. "[10]" -> "10".

    Strips the surrounding brackets/parens from the first marker found.
    Assumes `line` contains a marker (see is_bpart); raises AttributeError
    otherwise — presumably callers check is_bpart first; TODO confirm.
    """
    marker = re.search(bpart_regex, line).group(0)
    return re.sub(bonus_value_regex, '', marker)

def sanitize (html, valid_tags=[]):
soup = BeautifulSoup(html)
soup = BeautifulSoup(html, 'html.parser')
# get rid of comments
for comment in soup.findAll(
text=lambda text: isinstance(text, Comment)):
Expand Down

0 comments on commit 2f9ddd1

Please sign in to comment.