-
Notifications
You must be signed in to change notification settings - Fork 5
/
getsomedata.py
executable file
·79 lines (56 loc) · 2.6 KB
/
getsomedata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import os
import re
import urllib.request
from datetime import date
from datetime import timedelta
class LNTextRetriever:
    """Download articles from the lidovky.cz newspaper archive as plain text.

    The settings below are declared ``global`` inside the class body, so the
    assignments create module-level names (``archiv_url``, ``beggining_tag``,
    ``ending_tag``, ``debug``) rather than class attributes; the methods read
    them as ordinary globals.
    """

    # Settings
    global archiv_url, beggining_tag, ending_tag, debug
    # (prefix, suffix) of an archive URL; the date and the article number are
    # inserted between the two parts by get_url().
    archiv_url = r"http://www.lidovky.cz/ln_noviny.asp?c=A", r"_ln_noviny_sko"
    # Marker where the article text starts ("<!--FULLTEXTSTART-->" was used
    # as the start marker previously).
    beggining_tag = r'<div class="text bbtext">'
    # Marker where the article text ends.
    ending_tag = r"<!--INTEXTSTOP-->"
    # Truthy value enables diagnostic printing in get_article_text().
    debug = 1

    # Patterns replaced by a single space, applied in this order.
    _CLEANUP_PATTERNS = (
        re.compile(r"<[^>]*>"),  # strip all tags from the text
        # sentences left over from the printed edition of the newspaper
        re.compile(r"Pokračování na straně [0-9]+"),
        re.compile(r"Dokončení ze strany [0-9]+"),
        re.compile(r"Více čtěte na straně [0-9]+"),
        re.compile(r" čtk "),  # possible mention of CTK at the end of the article
        re.compile(r"\*+"),  # asterisks that occasionally appear in the text
    )

    def get_article_text(self, article_url, name):
        """Download one article, strip its markup and save it to ``<name>.txt``.

        The page is first downloaded to ``temp.txt`` in the current working
        directory and read back as cp1250 text. The article body is the span
        from the last occurrence of ``beggining_tag`` up to the last
        occurrence of ``ending_tag``.

        Args:
            article_url: URL of the article page.
            name: Output path without extension; ``".txt"`` is appended.

        Returns:
            The cleaned article text, or ``0`` when either marker is not
            found in the page (nothing is written in that case).
        """
        name = str(name) + ".txt"
        urllib.request.urlretrieve(article_url, "temp.txt")
        # The context manager closes the file on every path, including the
        # early return below.
        with open("temp.txt", encoding='cp1250') as page:
            content = page.read()

        b = content.rfind(beggining_tag)
        e = content.rfind(ending_tag)
        if debug:
            print("URL - " + article_url)
            print("Name - " + name)
            print("Start - " + str(b) + "\nEnd - " + str(e))
        if b == -1 or e == -1:
            print("Arcticle num. " + name + " missing")
            return 0

        content = content[b:e]
        for pattern in self._CLEANUP_PATTERNS:
            content = pattern.sub(' ', content)

        # No encoding is passed, so the output uses the platform default.
        with open(name, 'w') as out:
            out.write(content)
        return content

    def get_lndate(self, daydate):
        """Return ``daydate`` as a ``YYMMDD`` string, with the year counted from 2000.

        Args:
            daydate: Object with ``year``, ``month`` and ``day`` attributes,
                such as ``datetime.date``.
        """
        return "{:02d}{:02d}{:02d}".format(
            daydate.year - 2000, daydate.month, daydate.day
        )

    def get_url(self, daydate, article_num):
        """Build the archive URL of article ``article_num`` published on ``daydate``.

        The article number is zero-padded to six digits and joined to the
        ``YYMMDD`` date with an underscore, between the two parts of
        ``archiv_url``.
        """
        article_num = str(article_num).zfill(6)
        return archiv_url[0] + self.get_lndate(daydate) + '_' + article_num + archiv_url[1]
# Driver: fetch yesterday's articles numbered 1-29 and store each one as
# ./sobota/testNNN.txt.
retriever = LNTextRetriever()
daydate = date.today() - timedelta(1)
# get_article_text() opens the output file directly and does not create
# missing directories, so make sure the target directory exists first.
os.makedirs('./sobota', exist_ok=True)
for i in range(1, 30):
    name = './sobota/test' + str(i).zfill(3)
    retriever.get_article_text(retriever.get_url(daydate, i), name)