-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathQueryConfig.py
executable file
·103 lines (82 loc) · 3.94 KB
/
QueryConfig.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/bin/python3
'''
File: QueryConfig.py
Project: TBotLeecher2
File Created: Sunday, 27th October 2019 11:50:27 pm
Author: Erik Regla ([email protected])
-----
Last Modified: Monday, 28th October 2019 2:07:00 am
Modified By: Erik Regla ([email protected])
-----
Licensed under GPLv3, 2019 Erik Regla
'''
import yaml
import tweepy
class QueryConfig(object):
    """
    Configuration for the queries used while crawling.

    All attributes of a configuration are combined with an "and" operator, so
    a single configuration describes one narrow, specific crawler. Elements
    that should be combined differently are expected to live in separate
    configuration files unless the parameter file states otherwise.
    """

    def __init__(self, config_file_path: str, tweepy_api: "tweepy.API"):
        """
        Load a YAML query configuration and derive the query attributes.

        The file must provide a top-level ``type`` and a ``parameters``
        mapping containing ``download_media``. For ``type: streaming`` the
        ``track`` and ``locations`` parameters are read. For ``type: query``
        the search parameters are read and ``self.query`` is assembled from
        the ``hashtags``, ``terms`` and ``countries`` lists of the top-level
        ``query`` mapping.

        Parameters:
            config_file_path: path of the YAML configuration file.
            tweepy_api: API object used to resolve country names to place ids.

        Raises:
            OSError: if the configuration file cannot be opened.
            KeyError: if a required key is missing from the configuration.

        A YAML syntax error is reported on stdout and leaves the instance
        without any configuration attributes.
        """
        try:
            # The context manager closes the file even when parsing fails.
            with open(config_file_path, 'r') as query_file_stream:
                # NOTE(review): FullLoader accepts more than the safe YAML
                # subset; prefer yaml.safe_load if configuration files can
                # come from an untrusted source.
                query_file = yaml.load(query_file_stream, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            print("Error in configuration file:", exc)
            return

        self.tweepy_api = tweepy_api
        parameters = query_file["parameters"]
        self.download_media = parameters["download_media"]
        self.query_type = query_file["type"]

        if self.query_type == "streaming":
            self.track = parameters["track"]
            self.locations = parameters["locations"]
        elif self.query_type == "query":
            query_terms = query_file["query"]
            # NOTE(review): this used to read the undefined name
            # `config_file`, which raised NameError for every "query"
            # configuration. Reading from the loaded file is the closest
            # available meaning; confirm the intended source of "database".
            self.track = query_file.get("database")

            # build parameters
            self.language = parameters["language"]
            self.locale = parameters["locale"]
            self.since_id = parameters["since_id"]
            self.geocode = parameters["geocode"]
            self.show_user = parameters["show_user"]
            self.parameter_operator = parameters["parameter_operator"]
            self.query_operator = parameters["query_operator"]
            self.page_size = parameters["page_size"]
            self.stride = parameters["stride"]

            self.hashtags_query = self.build_query_string(
                query_terms["hashtags"], self.query_operator)
            self.terms_query = self.build_query_string(
                query_terms["terms"], self.query_operator)
            self.countries_query = self.build_country_parameters(
                query_terms["countries"], "place", self.query_operator)
            self.query = self.join_queries(
                [self.hashtags_query, self.terms_query, self.countries_query],
                self.parameter_operator)

    def join_queries(self, queries_array: list, operator):
        """
        Join the non-empty sub-queries with ``operator``.

        Returns "*" when no sub-query has any content.
        """
        # Materialise the filtered list: a lazy filter object is always
        # truthy, so the emptiness test must run on a real list.
        non_empty_queries = [query for query in queries_array if query]
        if not non_empty_queries:
            return "*"
        return self.build_query_string(non_empty_queries, operator)

    def evenly_space_operator(self, operator: str):
        """Return ``operator`` surrounded by exactly one space on each side."""
        return " " + operator.strip() + " "

    def build_query_string(self, array: list, operator: str = "OR"):
        """
        Concatenate the elements of ``array`` into a single query string.

        Returns an empty string when ``array`` is empty or None.
        """
        if not array:
            return ""
        # Copy into a list so that a one-shot iterable is handled as well.
        elements = list(array)
        if not elements:
            return ""
        return self.evenly_space_operator(operator).join(elements)

    def build_country_parameters(self, array: list, parameter: str, operator: str = "OR"):
        """
        Build a ``parameter:<place id>`` query from a list of country names.

        Each name is looked up through ``self.tweepy_api.geo_search`` with
        country granularity and the id of the first result is used.

        Returns an empty string when ``array`` is empty or None.

        Raises:
            IndexError: if the lookup for a country returns no result.
        """
        if not array:
            return ""
        place_ids = []
        for country in array:
            places = self.tweepy_api.geo_search(query=country, granularity="country")
            try:
                first_place = places[0]
            except IndexError:
                # Same exception type as before, but say which country failed.
                raise IndexError(
                    "no place found for country {!r}".format(country))
            place_ids.append(first_place.id)
        separator = self.evenly_space_operator(operator)
        return separator.join(
            "{}:{}".format(parameter, place_id) for place_id in place_ids)

    def test_variable_or_default(self, variable: str, file: dict, default=None):
        """
        Return ``file[variable]``, or ``default`` when it is missing or None.
        """
        value = file.get(variable)
        if value is None:
            return default
        return value

    def __str__(self):
        """Return the class followed by the instance attribute dictionary."""
        return str(self.__class__) + ": " + str(self.__dict__)