Skip to content

Commit

Permalink
fix: [crawler] filter lookup parent + domain daterange
Browse files Browse the repository at this point in the history
  • Loading branch information
Terrtia committed Oct 7, 2024
1 parent c8b1c67 commit 83e1108
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 3 deletions.
7 changes: 6 additions & 1 deletion bin/crawlers/Crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def __init__(self):
self.items_dir = None
self.original_domain = None
self.domain = None
self.parent = None

# TODO Replace with warning list ???
self.placeholder_screenshots = {'07244254f73e822bd4a95d916d8b27f2246b02c428adc29082d09550c6ed6e1a' # blank
Expand Down Expand Up @@ -243,6 +244,7 @@ def compute(self, capture):
return None

self.domain = Domain(domain)
self.parent = self.domain.get_parent()
self.original_domain = Domain(domain)

epoch = int(time.time())
Expand All @@ -263,14 +265,17 @@ def compute(self, capture):
# Save Capture
self.save_capture_response(parent_id, entries)

self.domain.update_daterange(self.date.replace('/', ''))
if self.parent != 'lookup':
# Update domain first/last seen
self.domain.update_daterange(self.date.replace('/', ''))
# Origin + History + tags
if self.root_item:
self.domain.set_last_origin(parent_id)
self.domain.update_vanity_cluster()
# Tags
for tag in task.get_tags():
self.domain.add_tag(tag)
# Crawler stats
self.domain.add_history(epoch, root_item=self.root_item)

if self.domain != self.original_domain:
Expand Down
2 changes: 1 addition & 1 deletion bin/lib/ail_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def get_default_correlation_objects():
return AIL_OBJECTS_CORRELATIONS_DEFAULT

def get_obj_queued():
    """Return the hard-coded list of object type names for queued objects.

    A new list is built on every call, so callers may mutate the result.
    """
    # NOTE(review): the original author questions whether 'screenshot'
    # should also be listed here — left unchanged, to be confirmed.
    return ['item', 'image', 'message', 'ocr', 'qrcode']  # screenshot ???

def get_objects_tracked():
    """Return the hard-coded list of object type names that are tracked.

    A new list is built on every call.
    """
    tracked_types = ['decoded', 'item', 'pgp', 'message', 'ocr', 'qrcode', 'title']
    return tracked_types
Expand Down
3 changes: 3 additions & 0 deletions bin/lib/ail_queues.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,9 @@ def get_processed_end_objs():
def get_processed_end_obj():
    """Pop one member from the 'objs:processed' set and return it.

    The value is whatever `r_obj_process.spop` returns.
    NOTE(review): `r_obj_process` is presumably a Redis client, in which
    case an empty set yields None — confirm against its definition.
    """
    # Plain literal: the key has no placeholders, so no f-string is needed.
    return r_obj_process.spop('objs:processed')

def is_obj_in_process(obj_gid):
    """Check whether `obj_gid` is a member of the 'objs:process' set.

    :param obj_gid: identifier looked up in the set (callers in this
        change pass an object's global id).
    :return: the raw result of `r_obj_process.sismember`.
        NOTE(review): presumably truthy when the object is currently being
        processed; the exact type depends on the client — confirm.
    """
    # Plain literal: the key has no placeholders, so no f-string is needed.
    return r_obj_process.sismember('objs:process', obj_gid)

def get_processed_objs_by_type(obj_type):
    """Return every member of the sorted set 'objs:process:<obj_type>'.

    :param obj_type: object type name interpolated into the key.
    :return: the result of `zrange(key, 0, -1)`, i.e. the full range.
    """
    key = f'objs:process:{obj_type}'
    first, last = 0, -1  # full range of the sorted set
    return r_obj_process.zrange(key, first, last)

Expand Down
7 changes: 6 additions & 1 deletion bin/lib/crawlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
# is safe ???
# TODO FILTER URL ???

def api_get_onion_lookup(domain):
def api_get_onion_lookup(domain): # TODO check if object process done ???
domain = domain.lower()
url_unpack = unpack_url(domain)
domain = url_unpack['domain']
Expand All @@ -78,6 +78,11 @@ def api_get_onion_lookup(domain):
if is_crawler_activated():
create_task(domain, parent='lookup', priority=0, har=D_HAR, screenshot=D_SCREENSHOT)
return {'error': 'domain not found', 'domain': domain}, 404
if not dom.was_up():
return {'error': 'domain not found', 'domain': domain}, 404
# else
## TODO: check whether object processing is done -> return the result if there is more than one history entry
# #-> check the item history
meta = dom.get_meta(options={'languages'})
meta['first_seen'] = meta['first_seen'].replace('/', '-')
meta['last_seen'] = meta['last_check'].replace('/', '-')
Expand Down
10 changes: 10 additions & 0 deletions bin/lib/objects/abstract_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
# Import Project packages
##################################
from lib import ail_logger
from lib.ail_queues import is_obj_in_process
from lib import Tag
from lib.ConfigLoader import ConfigLoader
from lib import Duplicate
Expand Down Expand Up @@ -92,6 +93,15 @@ def _set_field(self, field, value):
else:
return r_object.hset(f'meta:{self.type}:{self.get_subtype(r_str=True)}:{self.id}', field, value)

## Queues ##

# is_in_queue , is_in_module

def is_being_processed(self):
    """Tell whether this object is in the processing set.

    Looks up the object's global id (from `get_global_id()`) with
    `is_obj_in_process` and returns that result unchanged.
    """
    global_id = self.get_global_id()
    return is_obj_in_process(global_id)

# -Queues- #

## Tags ##
def get_tags(self, r_list=False):
tags = Tag.get_object_tags(self.type, self.id, self.get_subtype(r_str=True))
Expand Down

0 comments on commit 83e1108

Please sign in to comment.