web_crawler.py
import sys
import os
import glob
import json
import logging

import coloredlogs

from web import Url
from crawler import Crawler
coloredlogs.install()
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger("web_crawler")
# Name of the configuration file to use (optional; see config.json.example for the available options).
CONFIG_FILE = "cfg.json"
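# A minimal sketch of what cfg.json might contain (hypothetical values;
# config.json.example is the authoritative reference). "webpages_dir" is the
# only option this script currently reads (see setup_env below):
#
#   {
#       "webpages_dir": "/tmp/webpages"
#   }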
# If the user's URL does not include a protocol, prepend this one by default.
DEFAULT_PROTOCOL = "https://"
# List of the positional parameters this script expects (see the example mapping below).
params = [
    {"name": "url", "type": "str", "description": "The URL to start crawling from"},
    {"name": "depth", "type": "int", "description": "The crawling depth"},
]
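# For example, a hypothetical invocation `python web_crawler.py example.com 2`
# maps to url="https://example.com" (after DEFAULT_PROTOCOL is prepended) and depth=2.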
# The name of the directory to use for storing all the downloaded pages.
webpages_dir = os.path.join(os.getcwd(), "webpages")
def usage():
    '''
    Print the usage of the script.
    '''
    def param_to_string(param):
        return "\t{name}: {description} (type {type})".format(name=param["name"], description=param["description"], type=param["type"])

    usage_text = "\n" + "=" * 7 + "\n" + "Usage:\n" + "=" * 7 + "\n"
    usage_text += "Description:\n"
    usage_text += "\tThis script crawls the web from a given URL to a given depth, and returns a TSV with the rank results\n"
    usage_text += "Params:\n"
    usage_text += "\n".join([param_to_string(param) for param in params]) + "\n"
    usage_text += "Run:\n"
    usage_text += "\tpython web_crawler.py <url> <depth>\n"
    logger.critical(usage_text)
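# For reference, the usage text built above renders roughly as follows
# (the Params lines are generated from `params`):
#
#   =======
#   Usage:
#   =======
#   Description:
#       This script crawls the web from a given URL to a given depth, and returns a TSV with the rank results
#   Params:
#       url: The URL to start crawling from (type str)
#       depth: The crawling depth (type int)
#   Run:
#       python web_crawler.py <url> <depth>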
def parse_and_validate_args():
    '''
    Parse and validate the user's arguments for the script.
    '''
    if len(sys.argv) - 1 != len(params):
        logger.error("Supplied {supplied} args instead of {expected}".format(supplied=str(len(sys.argv) - 1), expected=str(len(params))))
        usage()
        sys.exit(1)
    url = sys.argv[1]
    depth = sys.argv[2]
    # Make sure there is a protocol on the url
    if not url.startswith("https://") and not url.startswith("http://"):
        url = DEFAULT_PROTOCOL + url
    # Make sure the first link is valid
    if Url.download(url) is None:
        logger.error("Provided url is not valid - {url}".format(url=url))
        usage()
        sys.exit(1)
    # Make sure the depth is a non-negative integer
    if not depth.isdigit():
        logger.error("Provided depth ({depth}) must be a non-negative integer (0 and up)".format(depth=depth))
        usage()
        sys.exit(1)
    return {"url": url, "depth": int(depth)}
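# For the hypothetical invocation above, a successful parse would return:
#   {"url": "https://example.com", "depth": 2}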
def setup_env():
    '''
    Check the user configuration file and set up the environment.
    Currently the configuration file contains the following options:
        webpages_dir: The directory where the script will store all downloaded HTML pages
    '''
    global webpages_dir
    # Open the user config file
    if os.path.exists(CONFIG_FILE):
        with open(CONFIG_FILE) as f:
            cfg = json.load(f)
        # Set the directory where all the webpages will be stored.
        # If the user configured a directory, and only if it exists, use it. Otherwise use the default.
        if "webpages_dir" in cfg and os.path.isdir(cfg["webpages_dir"]):
            webpages_dir = cfg["webpages_dir"]
        elif "webpages_dir" in cfg:
            logger.warning("Custom path for webpages_dir (%s) does not exist - using default (%s)", cfg["webpages_dir"], webpages_dir)
    os.makedirs(webpages_dir, exist_ok=True)
    # Clean the webpage directory of old files
    for f in glob.glob(os.path.join(webpages_dir, "*")):
        os.remove(f)
    logger.debug("Config:\n" + "=" * 10)
    logger.debug("WebPage Dir: " + webpages_dir)
    logger.debug("=" * 10)
def main(args):
    crawler = Crawler(url=args["url"], depth=args["depth"], webpages_dir=webpages_dir)
    crawler.crawl()
    crawler.get_results()
if __name__ == "__main__":
    setup_env()
    args = parse_and_validate_args()
    main(args)
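# Example run (hypothetical URL and depth):
#   python web_crawler.py example.com 2
# This crawls https://example.com to depth 2, stores the downloaded pages under
# ./webpages (or the configured webpages_dir), and collects the rank results
# via crawler.get_results().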