-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwebrelated.py
352 lines (313 loc) · 11.8 KB
/
webrelated.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
import os
from PyQt5.QtGui import QPixmap
from datetime import datetime
from SQLiterelated import expandstr
import json
import requests
from constants import *
# from you_get import common
def bili_thumb_link(bvid):
    """Return the cover-image URL of a Bilibili video, or '' on failure.

    Queries ``https://api.bilibili.com/x/web-interface/view`` with *bvid* and
    reads ``data.pic`` from the JSON reply.  ``headers`` is a module-level
    name that is not defined in this function (presumably supplied by the
    ``constants`` star import -- confirm).

    Args:
        bvid: Bilibili video id, inserted as the ``bvid`` query parameter.

    Returns:
        The cover URL as a string.  '' is returned when the request fails or
        times out, the reply is not valid JSON, the API ``code`` is not 0, or
        the reply carries no ``pic`` value.
    """
    api_url = 'https://api.bilibili.com/x/web-interface/view'
    try:
        # NOTE(review): verify=False is kept from the original; it disables
        # TLS certificate checking and should be dropped if possible.
        response = requests.get(api_url, params={'bvid': bvid}, headers=headers,
                                verify=False, timeout=10)
        content = json.loads(response.text)
    except (requests.RequestException, ValueError):
        # network problem or a non-JSON body: same result as an API error
        return ''
    if not isinstance(content, dict) or content.get('code') != 0:
        return ''
    data = content.get('data')
    if not isinstance(data, dict):
        return ''
    return data.get('pic') or ''
def _qqv_video_id(link):
    """Extract the Tencent Video id from *link*; return '' when none is found.

    Three link shapes are recognised:

    * ``.../page/<vid>.html``           -- the id is read from the URL;
    * ``.../cover/<album>/<vid>.html``  -- the id is read from the URL;
    * ``.../cover/<album>.html``        -- the page is downloaded and the
      first id of its ``var LIST_INFO = {"vid":["..."`` script is used.
    """
    if 'page/' in link:
        return link.split('page/')[1].split('.html')[0]
    if 'cover/' in link:
        link_tail = link.split('cover/')[1]
        if '/' in link_tail:
            # individual video inside an album
            return link_tail.split('/')[1].split('.html')[0]
        try:
            fulltext = requests.get(link, headers=headers, timeout=10).text
        except requests.RequestException:
            return ''
        marker = 'var LIST_INFO = {"vid":["'
        if marker not in fulltext:
            return ''
        return fulltext.split(marker)[1].split('"')[0]
    return ''


def qqv_thumb_link_old(link):
    """Return the legacy ``puui.qpic.cn`` thumbnail URL for a Tencent Video link.

    Args:
        link: URL of a Tencent Video page.

    Returns:
        The thumbnail URL, or '' when no video id can be determined (link
        shape not recognised, download failed, or id missing from the page).
    """
    vid = _qqv_video_id(link)
    if not vid:
        return ''
    return 'http://puui.qpic.cn/qqvideo_ori/0/{}_360_204/0'.format(vid)


def qqv_thumb_link(link):
    """Return the ``vpic.video.qq.com`` thumbnail URL for a Tencent Video link.

    Args:
        link: URL of a Tencent Video page.

    Returns:
        The thumbnail URL, or '' when no video id can be determined (link
        shape not recognised, download failed, or id missing from the page).
    """
    vid = _qqv_video_id(link)
    if not vid:
        return ''
    return 'https://vpic.video.qq.com/15746141/{}.png'.format(vid)
def iqiyi_thumb_link(link):
    """Return the thumbnail URL of an iQIYI video page, or '' on failure.

    The page at *link* is downloaded to find the ``tvid=...&aid`` token, then
    the ``playervideoinfo`` API is asked for that tvid and ``data.vpic`` is
    returned.  ``headers`` is a module-level name not defined in this function.

    Args:
        link: URL of the iQIYI video page.

    Returns:
        The picture URL as a string; '' when either request fails or times
        out, the page has no ``tvid=`` token, the API reply is not valid
        JSON, or the reply has no ``data.vpic`` value.
    """
    try:
        # first get the tvid embedded in the video page
        fulltext = requests.get(link, headers=headers, timeout=10).text
        if 'tvid=' not in fulltext:
            return ''
        tvid = fulltext.split('tvid=')[1].split('&aid')[0]
        # then get the image from the API
        api_link = 'http://pcw-api.iqiyi.com/video/video/playervideoinfo?tvid={}'.format(tvid)
        api_text = requests.get(api_link, headers=headers, timeout=10).text
        content = json.loads(api_text)
    except (requests.RequestException, ValueError):
        return ''
    data = content.get('data') if isinstance(content, dict) else None
    if not isinstance(data, dict):
        return ''
    return data.get('vpic') or ''
def videothumb(link, linktype, width, height):
    """Return a QPixmap thumbnail of a video page, scaled to width x height.

    The thumbnail URL is resolved per site (YouTube from the ``v`` query
    parameter, Bilibili / iQIYI / Tencent through the helper functions of
    this module), downloaded with ``linktopixmap`` and cropped/scaled with
    ``resizepixmap``.

    Args:
        link: URL of the video page.
        linktype: 'youtube', 'bilibili', 'iqiyi' or 'tencent'; any other
            value selects the placeholder image.
        width: target width passed to the scaling call.
        height: target height passed to the scaling call.

    Returns:
        A QPixmap.  When the type is not supported or no thumbnail URL can
        be resolved, ``./pics/unknownthumb.png`` scaled to the target size.
    """
    # local import: urllib.parse is not among this module's top-level imports
    from urllib.parse import parse_qs, urlparse

    thumbnail_link = ''
    if linktype == 'youtube':
        # read the "v" parameter itself rather than whatever follows the
        # first '=', so extra or reordered query parameters do not matter
        video_ids = parse_qs(urlparse(link).query).get('v')
        if video_ids:
            thumbnail_link = "http://img.youtube.com/vi/{}/0.jpg".format(video_ids[0])
    elif linktype == 'bilibili':
        # partition() yields '' instead of raising when 'video/' is absent
        bvid = link.partition('video/')[2].split('?')[0].split('/')[0]
        if bvid:
            thumbnail_link = bili_thumb_link(bvid)
    elif linktype == 'iqiyi':
        thumbnail_link = iqiyi_thumb_link(link)
    elif linktype == 'tencent':
        thumbnail_link = qqv_thumb_link(link)

    if thumbnail_link:
        rawpixmap = linktopixmap(thumbnail_link)
        return resizepixmap(rawpixmap, width, height)
    # unsupported type, or the site helper could not resolve a thumbnail
    pixmap = QPixmap('./pics/unknownthumb.png')
    return pixmap.scaled(width, height)
def linktopixmap(link):
    """Download the image at *link* and return it as a QPixmap.

    ``headers`` is a module-level name not defined in this function.

    Args:
        link: URL of the image.

    Returns:
        A QPixmap loaded from the downloaded bytes.  When the download fails
        or times out, the freshly constructed (unloaded) QPixmap is returned
        instead of raising; the result of ``loadFromData`` is likewise not
        checked, so callers receive a pixmap in every case.
    """
    qpixmap = QPixmap()
    try:
        data = requests.get(link, headers=headers, timeout=10).content
    except requests.RequestException:
        # best effort: hand back the unloaded pixmap rather than crash the caller
        return qpixmap
    # load this data into the QPixmap
    qpixmap.loadFromData(data)
    return qpixmap
def resizepixmap(qpixmap, width, height, area = 'middle'):
    """Crop a pixmap to a 16:9 region and scale it to width x height.

    Args:
        qpixmap: source pixmap; must provide ``size()``, ``copy()`` and
            ``scaled()`` as used below.
        width: target width passed to ``scaled``.
        height: target height passed to ``scaled``.
        area: 'top' keeps the top 16:9 strip (cuts the bottom), 'middle'
            keeps the vertically centred 16:9 strip (cuts top and bottom);
            any other value skips cropping and only scales.

    Returns:
        The cropped and scaled pixmap.
    """
    raw_width = qpixmap.size().width()
    raw_height = qpixmap.size().height()
    # height of a 16:9 frame as wide as the source; cast to int because
    # copy() takes integer coordinates (the 'top' branch used to pass a float)
    crop_height = int(raw_width / 16 * 9)
    if area == 'top':
        # cut the unwanted bottom
        qpixmap = qpixmap.copy(0, 0, raw_width, crop_height)
    elif area == 'middle':
        # cut the unwanted top and bottom black strips
        top_offset = int(0.5 * (raw_height - raw_width / 16 * 9))
        qpixmap = qpixmap.copy(0, top_offset, raw_width, crop_height)
    # any other area: scale directly, no cropping
    return qpixmap.scaled(width, height)
def refinelink(link, linktype):
    """Normalise a stored link according to its type.

    * Web types (as decided by ``istypeweb``): a link that already starts
      with ``http://`` or ``https://`` is returned unchanged.  Otherwise
      ``http://<link>`` is probed with a GET request; it is returned when the
      request succeeds and ``https://<link>`` is returned when it raises.
    * 'unknown': the link is returned unchanged.
    * Anything else is treated as a local file: the text after ``file:///``
      is returned, or the link itself when that prefix is absent.

    Args:
        link: the raw link or path.
        linktype: the link type string.

    Returns:
        The normalised link as a string.
    """
    if istypeweb(linktype):
        # test the scheme prefix, not a substring: 'http' may legitimately
        # appear later in a scheme-less address
        if link.lower().startswith(('http://', 'https://')):
            return link
        candidate = 'http://{}'.format(link)
        try:
            requests.get(candidate, headers=headers, timeout=5)
        except Exception:
            # best effort, as before: any failure of the plain-http probe
            # (including a timeout) falls back to https
            return 'https://{}'.format(link)
        return candidate
    if linktype == 'unknown':
        return link
    prefix = 'file:///'
    if prefix in link:
        return link.split(prefix)[1]
    # already a bare path: nothing to strip
    return link
def savabletitle(title):
    """Return *title* with the characters ``/ \\ < > : ? " | *`` removed.

    The first eight characters are the ones the original implementation
    stripped one ``replace`` at a time; ``*`` is added because it is the one
    remaining character that Windows also forbids in file names.

    Args:
        title: the title string.

    Returns:
        The title without any of the listed characters.
    """
    # one pass over the string instead of a chain of replace() calls
    return title.translate(str.maketrans('', '', '/\\<>:?"|*'))
def getlinktitle(link, linktype, document = None):
    """Return a display title for a link, depending on its type.

    * 'bilibili': the page is downloaded and the text between
      ``<title data-vue-meta="true">`` and ``_哔哩哔哩 (`` is returned.
    * other web types (``istypeweb``): the ``<title>`` element is used, or,
      when it is empty, the first ``title" content="`` attribute; a
      `` - YouTube`` suffix is cut off and en dashes become spaces.
    * 'pdf': the title of the first table-of-contents entry of *document*
      (``document.get_toc()[0][1]``) is used; the file name without its
      extension is used when that fails, is blank, or is the literal 'Title'.
    * 'unknown': the string 'unknown'.
    * anything else: the file name without its extension.

    Every lookup is best effort: a failed request or parse yields 'unknown'
    for web links and the file name for PDFs.

    Args:
        link: URL or file path.
        linktype: the link type string.
        document: object providing ``get_toc()``; only used for 'pdf'.

    Returns:
        The title as a string.
    """
    if linktype == 'bilibili':
        try:
            fulltext = requests.get(link, headers=headers, timeout=5).text
            return fulltext.split('<title data-vue-meta="true">')[1].split('_哔哩哔哩 (')[0]
        except Exception:
            # best effort: network failure or unexpected page layout
            return 'unknown'

    if istypeweb(linktype):
        try:
            r = requests.get(link, headers=headers, timeout=1)
            r.encoding = 'utf-8'
            fulltext = r.text
            rawtitle = fulltext.split('<title>')[1].split('</title>')[0]
            if rawtitle == '\n' or rawtitle == '':
                rawtitle = fulltext.split('title" content="')[1].split('"')[0]
        except Exception:
            # best effort: network failure or no title in the page
            rawtitle = 'unknown'
        # remove some useless text: YouTube titles carry a fixed suffix
        title = rawtitle.split(' - YouTube')[0]
        return title.replace('–', ' ')

    if linktype == 'unknown':
        return 'unknown'

    # splitext keeps dots inside the name ("a.b.pdf" -> "a.b"); the previous
    # split('.')[0] cut the name at its first dot
    filename_title = os.path.splitext(os.path.basename(link))[0]
    if linktype == 'pdf':
        try:
            rawtitle = document.get_toc()[0][1]
            # NOTE(review): the reviewed source showed replace('', ' ') here,
            # which would put a space between every character (the literal
            # was probably an invisible character lost in transit).  Mapping
            # every non-printable character to a space covers that intent.
            newtitle = ''.join(ch if ch.isprintable() else ' ' for ch in rawtitle)
        except Exception:
            # no document, empty table of contents, or an unexpected entry
            return filename_title
        if not newtitle.strip() or newtitle == 'Title':
            return filename_title
        return newtitle
    return filename_title
def titletotags(linktype, rawtitle):
    """Build the tag string and tag list for a link from its type and title.

    The title is cleaned with ``tidyup``, prefixed with *linktype*, passed
    through ``expandstr`` (imported from ``SQLiterelated``; its exact
    expansion is not visible here), split on whitespace and de-duplicated
    while keeping the first occurrence of each word.

    Args:
        linktype: the link type string; becomes the first tag.
        rawtitle: the unprocessed title.

    Returns:
        A tuple ``(tagstr, taglist)``: the unique tags joined by single
        spaces, and the same tags as a list.
    """
    # tidy up the title and put the linktype at the beginning of the tags
    tagstr = '{} {}'.format(linktype, tidyup(rawtitle))
    # expand the tags
    tagstr = expandstr(tagstr)
    # drop repeated words in one pass; dict keys keep insertion order
    taglist = list(dict.fromkeys(tagstr.split()))
    return " ".join(taglist), taglist
def getlinktype(link):
    """Classify a link and return ``(linktype, doi, pdflink)``.

    Checks are made in this order:

    1. known video sites, by URL fragment -> 'youtube', 'bilibili',
       'tencent' or 'iqiyi';
    2. any other link containing 'http' -> delegated to ``linktodoipdf``,
       which returns a 'paper' or 'web' triple;
    3. a link containing 'www.' or '.com' -> 'web';
    4. a link containing ':' (a local path) -> 'image' for jpg/png/jpeg,
       otherwise the lower-cased file extension, or 'unknown' when the path
       has no extension;
    5. everything else -> 'unknown'.

    Args:
        link: URL or file path.

    Returns:
        A 3-tuple of strings.  The second and third items are '' except
        for results coming from ``linktodoipdf``.
    """
    video_sites = (
        ('youtube.com/watch', 'youtube'),
        ('bilibili.com/video', 'bilibili'),
        ('v.qq.com', 'tencent'),
        ('iqiyi.com/v', 'iqiyi'),
    )
    for fragment, linktype in video_sites:
        if fragment in link:
            return linktype, '', ''
    if 'http' in link:
        return linktodoipdf(link)
    if 'www.' in link or '.com' in link:
        return 'web', '', ''
    if ':' in link:
        # lower-case so that "a.JPG" / "b.PDF" are recognised like their
        # lower-case forms; splitext yields '' when there is no extension
        extension = os.path.splitext(link)[1][1:].lower()
        if not extension:
            return 'unknown', '', ''
        if extension in ('jpg', 'png', 'jpeg'):
            return 'image', '', ''
        return extension, '', ''
    return 'unknown', '', ''
def istypeexist(linktype):
    """Return True when *linktype* is one of the link types listed below.

    Args:
        linktype: the link type string to check.

    Returns:
        True for 'youtube', 'bilibili', 'web', 'pdf', 'image', 'txt',
        'tencent', 'iqiyi' and 'paper'; False for anything else.
    """
    known_types = ('youtube', 'bilibili', 'web', 'pdf', 'image', 'txt',
                   'tencent', 'iqiyi', 'paper')
    return linktype in known_types
def istypeolvid(linktype):
    """Return True when *linktype* is one of the four video-site types.

    Args:
        linktype: the link type string to check.

    Returns:
        True for 'youtube', 'bilibili', 'tencent' and 'iqiyi'; False otherwise.
    """
    return linktype in ('youtube', 'bilibili', 'tencent', 'iqiyi')
def istypeweb(linktype):
    """Return True when *linktype* is one of the web-based types listed below.

    Args:
        linktype: the link type string to check.

    Returns:
        True for 'web', 'youtube', 'bilibili', 'tencent', 'iqiyi' and
        'paper'; False otherwise.
    """
    return linktype in ('web', 'youtube', 'bilibili', 'tencent', 'iqiyi', 'paper')
def tidyup(title=''):
    """Normalise a title into lower-case, space-separated words.

    Steps, in order:

    1. lower-case the title;
    2. delete brackets and punctuation: ``[ ] 【 】 { } ( ) （ ） ! + :``;
    3. turn the separators ``| 「 」 / - _ &`` into spaces;
    4. collapse runs of spaces into a single space;
    5. drop the filler words a, with, to, in, on, at, an, and, the when they
       stand between two spaces (same order and same plain ``replace``
       semantics as before);
    6. remove one trailing space, if any.

    Args:
        title: the raw title; defaults to ''.

    Returns:
        The tidied title.  An empty or fully stripped title gives ''
        (previously this raised IndexError on ``title[-1]``).
    """
    removed_chars = '[【】]{}()（）!+:'
    spaced_chars = '|「」/-_&'
    filler_words = (' a ', ' with ', ' to ', ' in ', ' on ', ' at ',
                    ' an ', ' and ', ' the ')

    # lowercase first
    title = title.lower()

    # one translate() pass replaces the chain of single-character replace() calls
    table = {ord(ch): None for ch in removed_chars}
    table.update({ord(ch): ' ' for ch in spaced_chars})
    title = title.translate(table)

    # collapse runs of spaces before matching the space-delimited filler
    # words, so that e.g. "x - the y" (now "x   the y") still matches ' the '
    while '  ' in title:
        title = title.replace('  ', ' ')

    for word in filler_words:
        title = title.replace(word, ' ')

    # if the last character of the title is a space, remove it
    return title[:-1] if title.endswith(' ') else title
def now():
    """Return the current local date and time as the integer YYYYMMDDHHMMSS.

    Returns:
        An int built from the year, month, day, hour, minute and second of
        ``datetime.now()``; the sub-second part is dropped.
    """
    # strftime gives the digits directly, with no slicing of str(datetime)
    return int(datetime.now().strftime('%Y%m%d%H%M%S'))
def downloadvideo(link,path='./'):
    """Download the video at *link* into *path* with the ``you-get`` program.

    The command is started with an argument list and no shell, so characters
    in the link or path (spaces, ``&``, ``;``, quotes ...) are passed to
    ``you-get`` literally instead of being interpreted by a shell.

    Args:
        link: URL of the video page.
        path: output directory given to ``you-get -o``; defaults to './'.

    Returns:
        None.  The call is best effort: a missing or non-executable
        ``you-get`` and a non-zero exit status are both ignored, as before.
    """
    # local import: subprocess is not among this module's top-level imports
    import subprocess

    try:
        subprocess.run(['you-get', '-o', path, link], check=False)
    except OSError:
        # you-get could not be started; keep the silent best-effort behaviour
        pass
def _scihub_lookup(link):
    """Look *link* up on sci-hub.se; return ('paper', doi_url, pdf_url).

    Raises when the request fails or the page does not contain the expected
    ``doi = '...'`` and ``iframe src = "..."`` fragments.
    """
    r = requests.get('https://sci-hub.se/{}'.format(link), headers=headers, timeout=1)
    r.encoding = 'utf-8'
    fulltext = r.text
    rawdoi = fulltext.split("doi = '")[1].split("';")[0]
    doi = 'https://doi.org/{}'.format(rawdoi)
    pdflink = fulltext.split('iframe src = "')[1].split('" id')[0]
    # add a scheme only when there is none; the previous substring test
    # ('https' not in pdflink) also rewrote plain http:// links
    if not pdflink.startswith(('http://', 'https://')):
        pdflink = 'https:{}'.format(pdflink)
    return 'paper', doi, pdflink


def _open_access_lookup(link):
    """Read the ``_pdf_url" content="...`` value of the page at *link*.

    Returns ('paper', link, pdf_url); raises when the request fails or the
    page has no such value.
    """
    r = requests.get(link, headers=headers, timeout=1)
    r.encoding = 'utf-8'
    fulltext = r.text
    pdflink = fulltext.split('_pdf_url" content="')[1].split('"')[0]
    return 'paper', link, pdflink


def linktodoipdf(link):
    """Classify an http link as a paper or a plain web page.

    Two lookups are tried in order: sci-hub.se (DOI and PDF link taken from
    its page), then the page itself (open-access article exposing a
    ``_pdf_url`` value).  The first one that succeeds decides the result.

    Args:
        link: URL of the page.

    Returns:
        ``('paper', doi_or_link, pdflink)`` from the first successful
        lookup, or ``('web', '', '')`` when both fail.
    """
    for lookup in (_scihub_lookup, _open_access_lookup):
        try:
            return lookup(link)
        except Exception:
            # best effort: a timeout, network error or unexpected page
            # layout simply moves on to the next lookup
            continue
    # non paper website
    return 'web', '', ''