run.py
import time
from multiprocessing import Process

import praw

from crawler import Crawler
from reddit_bot import RedditBot, TaskQueue, CrawTask, CommentBuilder, ReportBuilder
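
# The first argument is the praw.ini site name; the bot account's
# credentials are read from that config section.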
reddit = praw.Reddit('opendirectories-bot',
                     user_agent='github.com/simon987/opendirectories-bot v1.0 (by /u/Hexahedr_n)')
subreddit = reddit.subreddit("opendirectories")
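
# Only the 3 newest submissions are checked on each run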
subs = []
for submission in subreddit.new(limit=3):
    subs.append(submission)
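
# RedditBot remembers already-processed submission ids in crawled.txt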
bot = RedditBot("crawled.txt")
tq = TaskQueue()
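
# Queue a crawl task for each link submission that is neither crawled nor already queued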
for s in subs:
    if not s.is_self:
        if not bot.has_crawled(s.id) and not tq.is_queued(s.id):
            tq.push(CrawTask(s))
            print("id: " + s.id)
            print("url: " + str(s.url))
            print("title: " + str(s.title))
def execute_task(submission):
    try:
        if not bot.has_crawled(submission.id):
            c = Crawler(submission.url, True)
            c.crawl()
            c.store_report(submission.id, submission.title)

            report_builder = ReportBuilder(c.files, c.base_url)
            if report_builder.get_total_size() > 10000000:
                com_builder = CommentBuilder(ReportBuilder(c.files, c.base_url), c.base_url, submission.id)
                com_string = com_builder.get_comment()
                print(com_string)

                # Retry the reply every 10 minutes until it goes through
                # (e.g. when Reddit rate-limits the bot)
                while True:
                    try:
                        if not bot.has_crawled(submission.id):
                            submission.reply(com_string)
                            bot.log_crawl(submission.id)
                        break
                    except Exception as e:
                        print("Waiting 10 minutes: " + str(e))
                        time.sleep(600)
    except Exception as e:
        print(e)
        raise e
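

# Main loop: spawn a separate process for each queued task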
while len(tq.tasks) > 0:
    task = tq.pop()
    if task is not None:
        if not bot.has_crawled(task.submission.id):
            # args must be a tuple, not a set, for Process to unpack reliably
            p = Process(target=execute_task, args=(task.submission,))
            p.start()
            print("Started process for " + task.submission.title)
        else:
            # task.submission is a Submission object; concatenate its id instead
            print("Already crawled " + task.submission.id)