-
Notifications
You must be signed in to change notification settings - Fork 0
/
hash_url_mongo.py
executable file
·105 lines (76 loc) · 3.84 KB
/
hash_url_mongo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import io
import json
import ssl
import time
import urllib.request
from datetime import datetime
from logging import FileHandler, WARNING
from urllib.request import urlopen

import certifi
import imagehash
from flask import Flask, request, jsonify
from flask_pymongo import PyMongo, pymongo
from flask_script import Manager
from PIL import Image
from pymongo.errors import CursorNotFound, DuplicateKeyError
# Flask application and the Flask-Script manager that exposes its commands.
app = Flask(__name__)
manager = Manager(app)

# First connection: `mongo` is the handle hash_url() reads
# `online_news_result` from.
app.config.update(
    MONGO_DBNAME='news',
    MONGO_URI='mongodb://127.0.0.1:27017/news',
)
mongo = PyMongo(app)

# Second connection: `mongo_insert` is the handle hash_url() writes
# `news_hash_05_2019` to. The same config keys are overwritten *after* the
# first PyMongo object was built, so statement order matters here.
# NOTE(review): presumably each PyMongo instance captures MONGO_URI at
# construction time -- confirm against the installed Flask-PyMongo version.
app.config.update(
    MONGO_DBNAME='online_news',
    MONGO_URI='mongodb://127.0.0.1:27017/online_news',
)
mongo_insert = PyMongo(app)
def _read_date_range(path='date.txt', fmt='%Y-%m-%d %H:%M:%S'):
    """Return the ``(start, end)`` datetimes stored in *path*.

    The first two non-blank lines of the file are parsed with *fmt*; the
    first is the inclusive start and the second the exclusive end of the
    publication-date window.

    Raises:
        ValueError: if the file holds fewer than two non-blank lines, or a
            line does not match *fmt*.
    """
    with open(path, 'r') as date_file:
        stamps = [line.strip() for line in date_file if line.strip()]
    if len(stamps) < 2:
        raise ValueError(
            '%s must contain a start and an end timestamp on separate lines'
            % path
        )
    return datetime.strptime(stamps[0], fmt), datetime.strptime(stamps[1], fmt)


def _build_image_opener():
    """Return a urllib opener that sends a browser User-agent over verified TLS.

    The TLS context is attached to the opener itself. Passing ``cafile`` (or
    ``context``) to ``urllib.request.urlopen`` makes it build a private opener
    that ignores any globally installed one, which would drop the custom
    User-agent header; ``cafile`` is also gone from ``urlopen`` in Python 3.13.
    """
    tls_context = ssl.create_default_context(cafile=certifi.where())
    opener = urllib.request.build_opener(
        urllib.request.HTTPSHandler(context=tls_context)
    )
    opener.addheaders = [
        ('User-agent',
         'Mozilla/5.0 (Windows NT 5.1; rv:43.0) Gecko/20100101 Firefox/43.0'),
    ]
    return opener


def _perceptual_hash(opener, url):
    """Download the image at *url* and return its 8x8 pHash as a string."""
    # The context manager closes the HTTP response even if read() fails.
    with opener.open(url, timeout=10) as resp:
        payload = resp.read()
    image = Image.open(io.BytesIO(payload))
    return str(imagehash.phash(image, hash_size=8))


def _hash_and_store(doc, opener, target):
    """Hash the image referenced by *doc* and insert the result into *target*.

    Documents without a ``url`` are skipped. Any failure while downloading,
    decoding or inserting is printed and followed by a 10 second pause so a
    single bad image never aborts the whole run (deliberate best-effort).
    """
    url_image = doc.get('url')
    if url_image is None:
        return
    try:
        record = {
            '_id': doc['_id'],
            'hash': _perceptual_hash(opener, url_image),
            'pubdate': doc['pubDate'],
            'url': url_image,
        }
        print(record)
        target.insert_one(record)
    except DuplicateKeyError as dup:
        # Already hashed on an earlier run; not a transient failure, so
        # there is nothing to wait for.
        print(dup)
    except Exception as e:
        print(e)
        time.sleep(10)


@manager.command
def hash_url():
    """Compute perceptual hashes for the images published in a date window.

    The window is read from ``date.txt`` (start on the first line, end on the
    second, both ``%Y-%m-%d %H:%M:%S``). Every ``online_news_result`` document
    whose ``pubDate`` lies in ``[start, end)`` has its ``url`` downloaded and
    hashed, and a ``{_id, hash, pubdate, url}`` document is inserted into
    ``news_hash_05_2019``.

    If the server drops the cursor (``CursorNotFound``) the scan is resumed
    after a 10 second pause from the first document not yet visited; the
    function returns once the cursor has been exhausted.

    Raises:
        ValueError: if ``date.txt`` is malformed.
    """
    start, end = _read_date_range()
    # Query to collection online_news_result
    source = mongo.db.online_news_result
    target = mongo_insert.db.news_hash_05_2019
    opener = _build_image_opener()
    window = {"pubDate": {"$gte": start, "$lt": end}}

    processed = 0  # documents already visited; used as the resume offset
    finished = False
    while not finished:
        # skip() is only meaningful against a deterministic order, so the
        # scan is sorted on _id explicitly.
        cursor = (
            source.find(window, no_cursor_timeout=True)
            .sort('_id', 1)
            .skip(processed)
        )
        try:
            for doc in cursor:
                processed += 1
                _hash_and_store(doc, opener, target)
            finished = True
        except CursorNotFound as ec:
            print(ec)
            time.sleep(10)
        finally:
            # no_cursor_timeout cursors must be closed explicitly.
            cursor.close()
    print('Data Selesai diproses')
# Script entry point: hand control to the Flask-Script manager so that the
# commands registered with @manager.command (such as hash_url) can be run.
if __name__ == '__main__':
    manager.run()