# Web Crawler
#
# Functions required for crawling the web
#
# Function that gets the requested URL from the Internet (uses the Python 2
# urllib API)
def get_page(url):
    try:
        import urllib
        return urllib.urlopen(url).read()
    except:
        # return empty content if the page cannot be fetched
        return ""
# Helper function that returns the first link in the page and the end position
# of that link
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
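# Illustrative example (hypothetical snippet, not from the crawled pages):
#   get_next_target('<a href="http://example.com">x</a>')
#   returns ('http://example.com', 27), i.e. the URL and the index of its
#   closing quote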
# Helper function that adds to p every element of q not already in p
def union(p, q):
    for e in q:
        if e not in p:
            p.append(e)
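# For example:
#   p = ['a', 'b']
#   union(p, ['b', 'c'])
#   # p is now ['a', 'b', 'c']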
# Function that extracts all links from the requested page
def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links
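# For example (hypothetical page content, not actual crawl output):
#   get_all_links('<a href="http://a.com">a</a> <a href="http://b.com">b</a>')
#   returns ['http://a.com', 'http://b.com']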
# Initial function that starts the web crawl from a particular seed URL
def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    index = {}
    graph = {}
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            graph[page] = outlinks
            union(tocrawl, outlinks)
            crawled.append(page)
    return index, graph
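# The returned structures look like the following (hypothetical URLs, not
# actual crawl output):
#   index: {'the': ['http://a.com', 'http://b.com'], ...}   keyword -> URLs
#   graph: {'http://a.com': ['http://b.com'], ...}           page -> outlinks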
#
# Functions required for indexing crawled pages
#
# Function that adds a word to the index
def add_to_index(index, keyword, url):
    if keyword in index:
        index[keyword].append(url)
    else:
        # not found, add new keyword to index
        index[keyword] = [url]
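# For example (hypothetical URLs):
#   index = {}
#   add_to_index(index, 'word', 'http://a.com')
#   add_to_index(index, 'word', 'http://b.com')
#   # index is now {'word': ['http://a.com', 'http://b.com']}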
# Helper function that searches the index for the keyword
def lookup(index, keyword):
    if keyword in index:
        return index[keyword]
    else:
        return None
# Function that splits a page into words and adds them to the index
def add_page_to_index(index, url, content):
    words = content.split()
    for keyword in words:
        add_to_index(index, keyword, url)
# Function for recording clicks on a particular link.
# Note: this assumes each index entry is an [url, click_count] pair, whereas
# add_to_index above stores plain URL strings, so it only works if the index
# is built with [url, 0] entries instead.
def record_user_click(index, keyword, url):
    urls = lookup(index, keyword)
    if urls:
        for entry in urls:
            if entry[0] == url:
                entry[1] += 1
#
# Functions required for ranking
#
# Function that computes page ranks
#
# Formula used to compute the rank:
#
#   rank(page, 0) = 1 / npages
#   rank(page, t) = (1 - d) / npages
#                   + sum of (d * rank(p, t - 1) / number of outlinks from p)
#                     over all pages p that link to this page
#
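# A small worked example of the formula (hypothetical two-page graph, d = 0.8):
#   graph = {'A': ['B'], 'B': []}, npages = 2
#   rank(A, 0) = rank(B, 0) = 1/2 = 0.5
#   rank(A, 1) = (1 - 0.8) / 2 = 0.1            (no page links to A)
#   rank(B, 1) = 0.1 + 0.8 * 0.5 / 1 = 0.5      (A links to B, A has 1 outlink)
#   rank(B, 2) = 0.1 + 0.8 * 0.1 / 1 = 0.18, after which the values stay fixed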
def compute_ranks(graph):
    d = 0.8  # damping constant
    numloops = 40
    ranks = {}
    npages = len(graph)
    for page in graph:
        ranks[page] = 1.0 / npages
    for i in range(0, numloops):
        newranks = {}
        for page in graph:
            newrank = (1 - d) / npages
            # Loop through all pages
            for node in graph:
                # check if node links to page
                if page in graph[node]:
                    # Add to the new rank based on this node
                    newrank += d * ranks[node] / len(graph[node])
            newranks[page] = newrank
        ranks = newranks
    return ranks
# Function that returns the one URL most likely to be the best site for that
# keyword. If the keyword does not appear in the index, return None
def lucky_search(index, ranks, keyword):
    return_url = ''
    if keyword not in index:
        return None
    for url in index[keyword]:
        if url in ranks:
            if return_url != '':
                if ranks[url] > ranks[return_url]:
                    return_url = url
            else:
                return_url = url
    return return_url
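# For example, with hypothetical data (not from the actual crawl):
#   lucky_search({'x': ['A', 'B']}, {'A': 0.1, 'B': 0.3}, 'x') returns 'B',
#   the indexed URL with the highest rank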
# Function that returns the list of all URLs for that keyword, ordered by page
# rank. If the keyword does not appear in the index, return None
def ordered_search(index, ranks, keyword):
    pages = lookup(index, keyword)
    return quick_sort(pages, ranks)
# Helper function that uses the quicksort algorithm to order the pages by
# decreasing rank
def quick_sort(pages, ranks):
    if not pages or len(pages) <= 1:
        return pages
    else:
        pivot = ranks[pages[0]]  # find pivot
        worse = []
        better = []
        for page in pages[1:]:
            if ranks[page] <= pivot:
                worse.append(page)
            else:
                better.append(page)
        return quick_sort(better, ranks) + [pages[0]] + quick_sort(worse, ranks)
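# For example (hypothetical ranks):
#   quick_sort(['A', 'B', 'C'], {'A': 0.1, 'B': 0.5, 'C': 0.3})
#   returns ['B', 'C', 'A'], i.e. the pages in order of decreasing rank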
index, graph = crawl_web('http://www.udacity.com/cs101x/urank/index.html')
#print "\n Index: \n"
#print index
#print "\n Graph: \n"
#print graph
ranks = compute_ranks(graph)
#print ranks
print ordered_search(index, ranks, 'Hummus')