-
Notifications
You must be signed in to change notification settings - Fork 2
/
htmlparser.py
44 lines (31 loc) · 962 Bytes
/
htmlparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from html.parser import HTMLParser
import numpy as np
import pandas as pd
class DistTableHTMLParser(HTMLParser):
    """Extract a square, labelled numeric table from an HTML file.

    Text found after a ``<span>`` or ``<td>`` start tag is parsed as a float
    and appended to ``data_output``; text found inside a ``<b>`` element is
    collected (first occurrence only, in document order) in ``set_ids``.
    ``dist_table()`` then arranges the numbers into an ``n x n`` DataFrame
    whose rows and columns are both labelled with ``set_ids``.
    (Going by the class name this is presumably a pairwise distance
    matrix -- confirm against the files actually being parsed.)

    The parser only remembers the most recent start tag and the most recent
    end tag; it does not keep a stack of open elements.

    NOTE(review): because of that, a ``<b>`` label is ignored when the most
    recently closed tag is also ``b`` (e.g. ``<b>A</b><b>B</b>`` with nothing
    closed in between drops ``B``). This matches the original behaviour and
    is left unchanged.
    """

    def __init__(self, html_file, *args, **kwargs):
        """Create the parser and immediately parse *html_file*.

        Args:
            html_file: Path of the HTML file to read.
            *args, **kwargs: Forwarded unchanged to ``HTMLParser.__init__``.
        """
        super().__init__(*args, **kwargs)
        # Tag-tracking state must exist before any text is fed: handle_data()
        # reads both attributes and can fire before any tag has been seen
        # (leading text, or a <b> label before the first end tag).
        self.starttag = None
        self.endtag = None
        # numerical data
        self.data_output = []
        # set_ids
        self.set_ids = []
        self.load_html(html_file)

    def load_html(self, file):
        """Read *file* as text and feed its whole content to the parser."""
        with open(file, 'r') as f:
            doc = f.read()
        self.feed(doc)

    def handle_starttag(self, tag, attrs):
        """Remember the most recently opened tag (attributes are ignored)."""
        self.starttag = tag

    def handle_endtag(self, tag):
        """Remember the most recently closed tag."""
        self.endtag = tag

    def handle_data(self, data):
        """Route a text node to the numeric values or to the labels.

        Raises:
            ValueError: If non-blank text attributed to a ``span``/``td``
                cannot be converted to ``float``.
        """
        # Indentation and newlines between tags are still attributed to the
        # last opened tag; without this guard they would reach float() or be
        # stored as a bogus label.
        if not data.strip():
            return
        if self.starttag in ('span', 'td'):
            self.data_output.append(float(data))
        elif self.starttag == 'b' and self.endtag != 'b':
            # Labels appear more than once (e.g. row and column headers);
            # keep the first occurrence only, preserving document order.
            if data not in self.set_ids:
                self.set_ids.append(data)

    def dist_table(self):
        """Return the parsed values as a square, labelled DataFrame.

        Returns:
            pandas.DataFrame of shape ``(n, n)`` with ``n = len(set_ids)``,
            filled row by row from ``data_output`` and using ``set_ids`` as
            both index and columns.

        Raises:
            ValueError: If the number of parsed values is not ``n * n``.
        """
        n_cols = len(self.set_ids)
        n_values = len(self.data_output)
        if n_values != n_cols * n_cols:
            raise ValueError(
                f"cannot build a {n_cols}x{n_cols} table from "
                f"{n_values} parsed values"
            )
        data = np.array(self.data_output).reshape((n_cols, n_cols))
        return pd.DataFrame(data=data, index=self.set_ids, columns=self.set_ids)