-
Notifications
You must be signed in to change notification settings - Fork 0
/
html_tree.py
executable file
·236 lines (188 loc) · 8.54 KB
/
html_tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
import re
#from collections import Counter
import explorer
import tags_learner
class node:
    """One element (tag) of the parsed HTML tree.

    Attributes:
        depth: nesting level; 0 for a root, parent depth + 1 otherwise
            (assigned by the tree builder).
        index: two-item list ``[start_span, end_span]`` where each span is a
            ``(begin, end)`` pair of offsets into the HTML string. The end
            span is ``(0, 0)`` until the builder fills it in.
        list: child nodes, in the order they were appended.
        text: ``[order_number, text]`` entries collected for this node.
        tag: tag name; empty until the builder sets it.
        parent: parent node, or ``None``.
        number: sequence number given to this node by the builder.
        lastchild_number: builder counter value recorded once this node was
            finished.
        attr: attribute entries of the tag; starts as the extra positional
            arguments given to the constructor (empty by default).
    """

    def __init__(self, first_index, *attrs):
        """
        Args:
            first_index (two-item tuple of int): span of this node's start
                tag in the HTML string.
            *attrs: optional initial attribute entries. They used to be
                silently dropped, which also left ``attr`` undefined on a
                freshly built node; they now seed ``self.attr``.
                NOTE(review): presumably ``(name, value)`` pairs like the
                ones the tree builder stores - confirm.
        """
        self.depth = 0
        self.index = [first_index, (0, 0)]
        self.list = []
        self.text = []
        self.tag = ""
        self.parent = None
        self.number = 0
        self.lastchild_number = 0
        self.attr = list(attrs)

    def check_index(self, html):
        """Print the slices of *html* addressed by the start and end spans."""
        start_span, end_span = self.index
        print("start tag:", html[start_span[0]:start_span[1]])
        print("end tag:", html[end_span[0]:end_span[1]])
class tree:
    """Regex-driven HTML parser that builds a forest of ``node`` objects.

    ``make_tree`` walks the document tag by tag; ``recursive`` turns each
    start tag into a node and descends until an end tag is met. Text found
    between tags is stored on the enclosing node as ``[order_number, text]``
    entries so that ``get_text`` can sort the fragments.

    NOTE(review): the nesting of the statements inside ``recursive`` was
    reconstructed from a copy of this file whose indentation had been lost.
    The chosen nesting is the one that keeps text in document order and
    never indexes an empty list - confirm against the original layout.
    """

    # Compiled once; used with ``search(string, pos)`` so the document is
    # never copied just to resume scanning at an offset.
    _TAG_PATTERN = re.compile(r'<.*?>', re.S)
    _COMMENT_PATTERN = re.compile(r'<!--.*?-->', re.S)
    _SCRIPT_END_PATTERN = re.compile(r'</script\s*>', re.S | re.I)
    _ENDIF_PATTERN = re.compile(r'<!\[.*?endif.*?\].*?-->', re.S)
    _CONDITIONAL_PATTERN = re.compile(r'\[if.*?\]', re.S)
    _ATTR_PATTERN = re.compile(r'([^\s=]+)\s*=\s*(["\'])(.*?)\2', re.S)
    _WHITESPACE_PATTERN = re.compile(r'\t|\n|\r')

    def __init__(self, xml, Text=True, limit=1):
        """
        Args:
            xml (str): the HTML/XML document to parse.
            Text (bool): whether nodes carrying text are collected in
                ``self.textnodes``.
            limit (int): a node is collected when its text, spaces removed,
                has at least this many characters.
        """
        self.xml = xml
        self.tags = []
        # Tags in the blacklist are treated as leaves: no end tag is searched.
        self.blacklist = tags_learner.get_blacklist(True)
        print("black:", self.blacklist)
        self.Text = Text
        self.limit = limit
        self.textnodes = []  # nodes whose text reached ``limit``
        self.c = 0  # running node counter
        self.text_number = 0  # running counter used to order text fragments
        # Defined up front so the query/printing helpers do not raise
        # AttributeError when called before make_tree()/get_text().
        self.trees = []
        self.version = 'IE 9'
        self.text = []

    def make_tree(self, IE_version='IE 9'):
        """Parse ``self.xml`` into ``self.trees`` (a list of root nodes).

        Args:
            IE_version (str): controls how conditional comments are read;
                one of ``'IE 9'``, ``'IE 8'``, ``'IE 7'``, ``'IE 6'``.
        """
        self.trees = []
        self.version = IE_version
        index = (0, 0)
        while index[1] < len(self.xml):
            _, index, root = self.recursive(skip=index[1])
            if root is not None:
                self.trees.append(root)

    def recursive(self, skip=0, parent=None):
        """Parse the next tag at or after *skip*, descending into children.

        Assumes the very first tag of the document is not one that has to be
        skipped (such as ``<script>``). Internet Explorer conditional
        comments are handled according to ``self.version``.

        Args:
            skip (int): offset in ``self.xml`` where scanning starts.
            parent: node that will own the parsed node, or ``None``.

        Returns:
            tuple ``(is_start_tag, span, node)``. ``span`` is the span of the
            last tag consumed; ``node`` is ``None`` when an end tag (or the
            end of the document) was met instead of a start tag.
        """
        thisistag, index_t = self.get_nexttag(skip=skip)
        if not thisistag:
            # End tag, or nothing left to parse.
            return thisistag, index_t, None

        self.c += 1
        thisnode = self.make_node(index_t, parent)
        thisnode.number = self.c

        if thisnode.tag == "script":
            # Script bodies are skipped wholesale up to the closing tag.
            index_t = self._find_closing(self._SCRIPT_END_PATTERN, index_t[1])
            thisnode.index[1] = index_t
        elif thisnode.tag.startswith('!'):
            # Comments, doctype and conditional comments.
            sentence = self.xml[index_t[0]:index_t[1]]
            is_conditional = self._CONDITIONAL_PATTERN.search(sentence) is not None
            # html_IE() truthy -> the conditional block is read as markup,
            # so only the opening marker itself is consumed here.
            if is_conditional and not explorer.html_IE(sentence, self.version):
                # Read as a comment: skip everything up to the endif marker.
                index_t = self._find_closing(self._ENDIF_PATTERN, index_t[1])
            thisnode.index[1] = index_t
        elif thisnode.tag in self.blacklist:
            thisnode.index[1] = index_t
        else:
            # Ordinary start tag: consume children until an end tag appears.
            check = True
            while check:
                index_blast = index_t[1]
                check, index_t, child_node = self.recursive(index_t[1], thisnode)
                if child_node is None:
                    continue
                thisnode.list.append(child_node)
                self.text_number += 1
                add_text = self._WHITESPACE_PATTERN.sub(
                    "", self.xml[index_blast:child_node.index[0][0]])
                if add_text in ("", " "):
                    continue
                thisnode.text.append([self.text_number + thisnode.depth, add_text])
                if child_node.text != []:
                    thisnode.text[-1][0] = child_node.text[-1][0] + 1
                if len(child_node.list) == 0 and child_node.text != []:
                    # Text inside a leaf: swap the numbers so the text that
                    # precedes the leaf is ordered before the leaf's own.
                    child_node.text[0][0], thisnode.text[-1][0] = (
                        thisnode.text[-1][0], child_node.text[0][0])
            thisnode.index[1] = index_t
            self.text_number += 1
            add_text = self._WHITESPACE_PATTERN.sub(
                "", self.xml[index_blast:index_t[0]])
            if add_text not in ("", " "):
                thisnode.text.append([self.text_number + thisnode.depth, add_text])

        thisnode.lastchild_number = self.c
        if self.Text:
            text = "".join(entry[1] for entry in thisnode.text)
            if len(text.replace(" ", "")) >= self.limit:
                self.textnodes.append(thisnode)
        return thisistag, index_t, thisnode

    def _find_closing(self, pattern, start):
        """Return the span of the first *pattern* match at or after *start*.

        When the terminator is missing the rest of the document is treated as
        consumed and an empty span at the end of ``self.xml`` is returned
        (the unguarded ``.span()`` call used to raise AttributeError).
        """
        found = pattern.search(self.xml, start)
        if found is None:
            end = len(self.xml)
            return (end, end)
        return found.span()

    def get_nexttag(self, skip):
        """Locate the first tag at or after *skip*.

        Returns:
            tuple ``(is_start, (begin, end))``. ``is_start`` is False for an
            end tag and also when no tag is left, in which case the span is
            ``(skip, len(self.xml))``.
        """
        match = self._TAG_PATTERN.search(self.xml, skip)
        if match is None:
            print("skip:", skip)
            return False, (skip, len(self.xml))
        found = match.group()
        if found[1:4] == "!--" and "[if" not in found and "endif" not in found:
            # A plain comment may contain '>' itself; extend to the real
            # '-->'. An unterminated comment keeps the shorter match instead
            # of crashing.
            comment = self._COMMENT_PATTERN.search(self.xml, skip)
            if comment is not None:
                match = comment
        is_start = match.group()[1] != "/"
        return is_start, match.span()

    def make_node(self, index0, parent_node):
        """Build a node for the tag at span *index0* (brackets included).

        The tag name is the first whitespace-delimited token inside the
        brackets. Attributes are ``(name, value)`` tuples for every
        ``name="value"`` or ``name='value'`` pair; values may contain spaces.
        Each new tag name is also recorded in ``self.tags``.
        """
        node_ = node(index0)
        node_.parent = parent_node
        if parent_node is not None:
            node_.depth = parent_node.depth + 1
        # Split on any whitespace so a newline or tab after the tag name
        # does not glue the name to the first attribute.
        inner = self.xml[index0[0] + 1:index0[1] - 1]
        pieces = inner.split(None, 1)
        node_.tag = pieces[0] if pieces else ""
        remainder = pieces[1] if len(pieces) > 1 else ""
        node_.attr = [
            (name, value)
            for name, _quote, value in self._ATTR_PATTERN.findall(remainder)
        ]
        if node_.tag not in self.tags:
            self.tags.append(node_.tag)
        return node_

    def search_specifytag(self, *tags, tree_index=0):
        """Collect, breadth first, every node whose tag is one of *tags*.

        Args:
            *tags: tag names to look for.
            tree_index (int): index in ``self.trees`` of the root to search.

        Returns:
            list of matching nodes, the root first when it matches.
        """
        root = self.trees[tree_index]
        targets = [root] if root.tag in tags else []
        level = [root]
        while level:
            children = [child for owner in level for child in owner.list]
            targets.extend(child for child in children if child.tag in tags)
            level = children
        print("tags", tags)
        return targets

    def search_attrs(self):
        """Not implemented yet."""
        pass

    def get_text(self, Sort=True):
        """Gather the text entries of all collected nodes into ``self.text``.

        Args:
            Sort (bool): order the entries by their order number (stable).
        """
        self.text = []
        for text_node in self.textnodes:
            self.text.extend(text_node.text)
        if Sort:
            self.text = sorted(self.text, key=lambda entry: entry[0])

    def print_text(self):
        """Print the text part of every entry in ``self.text``."""
        for entry in self.text:
            print(entry[1])
def conditional_comment():
    """Placeholder: check whether the Explorer version satisfies the version
    named in a conditional comment.

    Not implemented; takes no arguments and returns ``None``.
    """
    return None
def sort_index(list_, index_number):
    """Return the elements of *list_* ordered by one component of their index.

    Each element has the form ``(index, value)`` where ``index`` is a tuple;
    the sort key is ``index[index_number]`` (0-based), e.g. with
    ``index == (2, 3, 4)`` and ``index_number == 1`` the key is ``3``.

    The original body gathered these keys and then discarded them, so the
    function always returned ``None``.
    NOTE(review): returning a new sorted list is inferred from the function
    name and docstring - confirm this is the intended result.

    Args:
        list_: iterable of ``(index_tuple, value)`` pairs.
        index_number (int): position inside the index tuple to sort by.

    Returns:
        A new list, sorted ascending (stable); *list_* is not modified.
    """
    return sorted(list_, key=lambda element: element[0][index_number])