Skip to content

Commit

Permalink
Fix issue grapesmoker#2 by skipping all content until we find a number. Also specify the parser in BeautifulSoup to remove a warning that occurs when running the parser.
Browse files Browse the repository at this point in the history
  • Loading branch information
alopezlago committed Jul 22, 2016
1 parent ee39d85 commit 2f9ddd1
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 2 deletions.
12 changes: 11 additions & 1 deletion Packet.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import subprocess
import codecs

from utils import ansregex, bpart_regex, is_answer, is_bpart, get_bonus_part_value, sanitize, conf_gen
from utils import ansregex, bpart_regex, is_answer, is_bpart, get_bonus_part_value, sanitize, conf_gen, starts_with_number

class InvalidPacket(Exception):

Expand Down Expand Up @@ -132,6 +132,16 @@ def prepare_html_file(self, html_file, skip_lines=0):

packet_contents = map(lambda x: sanitize(x, valid_tags=['em', 'strong']),
packet_contents.split('\n'))

# Skip all of the packet information at the beginning
first_line_with_number_index = 0
for line in packet_contents:
if starts_with_number(line):
break
first_line_with_number_index += 1

packet_contents = packet_contents[first_line_with_number_index:]

packet_contents = [x.strip() for x in packet_contents if sanitize(x).strip() != ''
and len(x) > 20
and (not re.search('Tossups', x, flags=re.I))
Expand Down
7 changes: 6 additions & 1 deletion utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Case-insensitive match for an answer label such as "ANSWER:" or "Answers:"
# (the two wildcard chars tolerate OCR/typo noise like "asnwer:").
ansregex = r'(?i)a..?wers?:'
# A bonus-part point marker, e.g. "[10]", at the start of a line.
bpart_regex = r'^[\s]*\[\d+\]'
# Bracket/paren characters stripped when extracting a bonus part's value.
bonus_value_regex = r'\[|\]|\(|\)'
# A leading question number like "1." — optionally wrapped in <strong> tags.
num_regex = r'^([\s]*<strong>[\d]+\.[\s]*<\/strong>[\s]*|[\s]*[\d]+\.[\s]*)'

def is_answer(line):

Expand All @@ -16,13 +17,17 @@ def is_bpart(line):

return re.search(bpart_regex, line) is not None

def starts_with_number(line):
    """Return True if `line` begins with a question number.

    A question number is something like "1." at the start of the line,
    optionally wrapped in <strong> tags (see num_regex).
    """
    return bool(re.match(num_regex, line))

def get_bonus_part_value(line):
    """Extract the point value from a bonus-part marker, e.g. "[10]" -> "10".

    Strips the surrounding brackets/parens from the first marker found.
    Assumes `line` contains a marker (see is_bpart); raises AttributeError
    otherwise — presumably callers check is_bpart first; TODO confirm.
    """
    marker = re.search(bpart_regex, line).group(0)
    return re.sub(bonus_value_regex, '', marker)

def sanitize (html, valid_tags=[]):
soup = BeautifulSoup(html)
soup = BeautifulSoup(html, 'html.parser')
# get rid of comments
for comment in soup.findAll(
text=lambda text: isinstance(text, Comment)):
Expand Down

0 comments on commit 2f9ddd1

Please sign in to comment.