diff --git a/FetchAgent/server.py b/FetchAgent/server.py index 8015d2d7d..67d561730 100644 --- a/FetchAgent/server.py +++ b/FetchAgent/server.py @@ -10,7 +10,7 @@ import os import signal from gevent.server import StreamServer -import FetchAgent2.MessageProcessor +import FetchAgent.MessageProcessor import traceback import logSetup import mprpc @@ -173,7 +173,7 @@ def run(): raise initialize_manager(interface_dict) - amqp_interface = FetchAgent2.MessageProcessor.MessageProcessor(interface_dict) + amqp_interface = FetchAgent.MessageProcessor.MessageProcessor(interface_dict) print("AMQP Interfaces have started. Launching RPC threads.") diff --git a/RawArchiver/Modules/WebComics.py b/RawArchiver/Modules/WebComics.py index 2126452c4..82e69bbf4 100644 --- a/RawArchiver/Modules/WebComics.py +++ b/RawArchiver/Modules/WebComics.py @@ -295,6 +295,12 @@ class WebComicsRawModule(RawArchiver.ModuleBase.RawScraperModuleBase): '/lost-omens?q=forum/lost-omens&q=forum/', '/office-life?q=tags/office-life&q=tags/', '/handbasketd&q=comic/handbasketd&q=comic/', + + '/printthread.php?', + '/forums/newreply.php', + '?do=newreply&', + '/forums/showsinglepost.php', + '&_debug=1&_debug=1&', ] @classmethod diff --git a/app/sub_views/misc_views.py b/app/sub_views/misc_views.py index 5010c29bf..277205b61 100644 --- a/app/sub_views/misc_views.py +++ b/app/sub_views/misc_views.py @@ -11,6 +11,7 @@ from WebMirror import rules import common.global_constants +import RawArchiver.RawActiveModules def getBadWords(ruleset, netloc): @@ -63,19 +64,23 @@ def get_random_url_group(num_items): return ret +def raw_url_filtered(url): + + for module in RawArchiver.RawActiveModules.ACTIVE_MODULES: + if module.cares_about_url(url): + return False + return True + + def get_random_raw_url_group(num_items): dat = g.session.execute('''SELECT url FROM raw_web_pages TABLESAMPLE SYSTEM(:percentage) ORDER BY url;''', {'percentage' : num_items}) dat = list(dat) - ruleset = rules.load_rules(override=True) ret = [] for linkurl, in dat: - nl = urllib.parse.urlparse(linkurl).netloc - - badwords, badcompounds = getBadWords(ruleset, nl) - filtered = isFiltered(linkurl, badwords, badcompounds) + filtered = raw_url_filtered(linkurl) ret.append((linkurl, filtered))