Skip to content

Commit

Permalink
Fix the random url sampling system for raw URLs.
Browse files Browse the repository at this point in the history
  • Loading branch information
fake-name committed Dec 27, 2017
1 parent 0dfff83 commit 1f9ec58
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 7 deletions.
4 changes: 2 additions & 2 deletions FetchAgent/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import os
import signal
from gevent.server import StreamServer
import FetchAgent2.MessageProcessor
import FetchAgent.MessageProcessor
import traceback
import logSetup
import mprpc
Expand Down Expand Up @@ -173,7 +173,7 @@ def run():
raise

initialize_manager(interface_dict)
amqp_interface = FetchAgent2.MessageProcessor.MessageProcessor(interface_dict)
amqp_interface = FetchAgent.MessageProcessor.MessageProcessor(interface_dict)

print("AMQP Interfaces have started. Launching RPC threads.")

Expand Down
6 changes: 6 additions & 0 deletions RawArchiver/Modules/WebComics.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,12 @@ class WebComicsRawModule(RawArchiver.ModuleBase.RawScraperModuleBase):
'/lost-omens?q=forum/lost-omens&q=forum/',
'/office-life?q=tags/office-life&q=tags/',
'/handbasketd&q=comic/handbasketd&q=comic/',

'/printthread.php?',
'/forums/newreply.php',
'?do=newreply&',
'/forums/showsinglepost.php',
'&_debug=1&_debug=1&',
]

@classmethod
Expand Down
15 changes: 10 additions & 5 deletions app/sub_views/misc_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from WebMirror import rules
import common.global_constants
import RawArchiver.RawActiveModules


def getBadWords(ruleset, netloc):
Expand Down Expand Up @@ -63,19 +64,23 @@ def get_random_url_group(num_items):

return ret

def raw_url_filtered(url):
    """
    Report whether `url` is filtered out for the raw archiver.

    Each entry of RawArchiver.RawActiveModules.ACTIVE_MODULES is asked, in
    order, whether it cares about the URL (via its cares_about_url()
    method). The URL counts as *not* filtered (False) as soon as one module
    claims it; if no module does, it is considered filtered (True).
    """
    active_modules = RawArchiver.RawActiveModules.ACTIVE_MODULES
    # any() short-circuits on the first module that claims the URL.
    claimed = any(plugin.cares_about_url(url) for plugin in active_modules)
    return not claimed



def get_random_raw_url_group(num_items):
dat = g.session.execute('''SELECT url FROM raw_web_pages TABLESAMPLE SYSTEM(:percentage) ORDER BY url;''', {'percentage' : num_items})
dat = list(dat)

ruleset = rules.load_rules(override=True)

ret = []
for linkurl, in dat:
nl = urllib.parse.urlparse(linkurl).netloc

badwords, badcompounds = getBadWords(ruleset, nl)
filtered = isFiltered(linkurl, badwords, badcompounds)
filtered = raw_url_filtered(linkurl)

ret.append((linkurl, filtered))

Expand Down

0 comments on commit 1f9ec58

Please sign in to comment.