wikihopper.py
from bs4 import BeautifulSoup
import requests
import queue
from urllib.parse import urljoin, urlsplit, urlunsplit
import argparse
from node import Node
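
# Node is imported from node.py, which is not part of this file. Based on how
# it is used below (Node(url, parent), .url, .parent, .depth), a minimal sketch
# of the interface this script assumes (a hypothetical reconstruction, not the
# actual implementation) would be:
#
#     class Node:
#         """A single article in the search tree."""
#         def __init__(self, url, parent):
#             self.url = url
#             self.parent = parent
#             # Depth is 0 for the start node, parent's depth + 1 otherwise.
#             self.depth = 0 if parent is None else parent.depth + 1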

parser = argparse.ArgumentParser(
    description="Follow Wikipedia links to find the shortest hop distance between two articles.")
parser.add_argument("target_url", help="URL of the Wikipedia article to search for")
parser.add_argument("--start_url", help="Starting article URL (default: a random article)")
parser.add_argument("--maxdepth", help="Maximum search depth (default: 3)", type=int)


def get_urls_in_page(url):
    '''
    Get, clean, and filter the URLs on the page.
    Ignores any explicit base URL (<base> tag) declared in the HTML.
    Currently hardcoded for Wikipedia article links.
    '''
    all_urls = extract_urls(url)
    base_url = get_base_url(url)
    complete_urls = [complete_url(u, base_url) for u in all_urls]
    filt = urljoin(base_url, "wiki")  # TODO: Remove wiki hardcoding...
    # Keep only article links; drop special pages such as "File:" links.
    filtered_urls = list(filter(lambda x: "File:" not in x,
                                filter(lambda x: filt in x, complete_urls)))
    return filtered_urls


def get_base_url(url):
    """Get the base url of page"""
    split_url = urlsplit(url)
    base_url = urlunsplit((split_url.scheme, split_url.netloc, "", "", ""))
    return base_url


def extract_urls(url):
    """Extract all link targets (href attributes) on the page and return them as a list"""
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'lxml')
    anchors = soup.find_all("a")
    # Skip anchors that have no href attribute.
    return [a.get("href") for a in anchors if a.get("href") is not None]


def complete_url(url, base_url):
    """Complete relative URL with base_url"""
    return urljoin(base_url, url)
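
# Example (the article path below is only an illustration): with
# base_url = "https://en.wikipedia.org", complete_url("/wiki/Graph_theory", base_url)
# returns "https://en.wikipedia.org/wiki/Graph_theory", which is standard
# urljoin behaviour; absolute URLs are returned unchanged.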


def bfs(start_url, target_url, max_depth):
    """Perform a breadth-first search from start_url until target_url is found
    or max_depth is exceeded."""
    q = queue.Queue()
    start_node = Node(start_url, None)
    q.put(start_node)
    visited = set()
    n_nodes = 0
    while not q.empty():
        print("Checking node {0}".format(n_nodes))
        curr_node = q.get()
        if curr_node.url in visited:
            continue
        urls = get_urls_in_page(curr_node.url)
        for u in urls:
            tmp_node = Node(u, curr_node)
            if tmp_node.url == target_url:
                print("Found relation at depth {0}".format(tmp_node.depth))
                return tmp_node
            if tmp_node.depth > max_depth:
                # BFS expands nodes in depth order, so the first node past
                # max_depth means no shallower path exists.
                print("Could not find relation in {0} hops or fewer".format(max_depth))
                return None
            q.put(tmp_node)
        n_nodes += 1
        visited.add(curr_node.url)


def main():
    args = parser.parse_args()
    target_url = args.target_url
    if args.start_url:
        start_url = args.start_url
    else:
        start = "https://en.wikipedia.org/wiki/Special:Random"
        # Get the actual URL, since Special:Random redirects to a random article.
        r = requests.get(start)
        start_url = r.url
    if args.maxdepth is not None:
        max_depth = args.maxdepth
    else:
        max_depth = 3
    print("Starting at " + start_url)
    solution = bfs(start_url, target_url, max_depth)
    # Backtrace the path from the target node to the start node.
    trace = []
    curr_node = solution
    while curr_node is not None:
        trace.append(curr_node.url)
        curr_node = curr_node.parent
    trace.reverse()
    for t in trace:
        print(t)


if __name__ == '__main__':
    main()
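
# Example invocation (the article URLs below are placeholders, not taken from
# the repository; substitute the articles you want to connect):
#
#     python wikihopper.py https://en.wikipedia.org/wiki/Graph_theory \
#         --start_url https://en.wikipedia.org/wiki/Breadth-first_search \
#         --maxdepth 3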