// fievel.java
import javax.net.SocketFactory;
import javax.net.ssl.SSLSocketFactory;
import java.lang.Thread;
import java.lang.Math;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.net.*;
import java.io.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class fievel {
// https://stackoverflow.com/a/5762502
private static final String ANSI_RESET = "\u001B[0m";
private static final String ANSI_BLACK = "\u001B[30m";
private static final String ANSI_RED = "\u001B[31m";
private static final String ANSI_GREEN = "\u001B[32m";
private static final String ANSI_YELLOW = "\u001B[33m";
private static final String ANSI_BLUE = "\u001B[34m";
private static final String ANSI_PURPLE = "\u001B[35m";
private static final String ANSI_CYAN = "\u001B[36m";
private static final String ANSI_WHITE = "\u001B[37m";
public static class DepthPair {
private int depth = -1;
private ArrayList<String> URLs = new ArrayList<>();
DepthPair() {
}
/**
* @param URLs : A list of URLs found at a certain depth.
* @param depth : The depth where the URLs were found.
* @Constructor
*/
DepthPair(ArrayList<String> URLs, int depth) {
this.depth = depth;
this.URLs = URLs;
}
/**
* @param depth : Relative depth of the pair.
*/
void set_depth(int depth) {
this.depth = depth;
}
/**
* @param URLs : URLs retrieved.
*/
public void set_URL(ArrayList<String> URLs) {
this.URLs = URLs;
}
/**
* @return ArrayList
*/
ArrayList<String> get_urls() {
return this.URLs;
}
/**
* @return int
*/
int get_depth() {
return this.depth;
}
/**
* @param url : A single URL to add.
*/
void append_url(String url) {
this.URLs.add(url);
}
/**
* @param urls : An ArrayList of URLs to add.
*/
void append_urls(ArrayList<String> urls) {
this.URLs.addAll(urls);
}
/**
* @param url : Remove a single URL.
*/
public void remove_url(String url) {
this.URLs.remove(url);
}
/**
* Remove all URLs from the DepthPair object.
*/
public void remove_all_urls() {
this.URLs.clear();
}
/**
* @return String
*/
@Override
public String toString() {
if (this.URLs.size() == 0) {
return ANSI_PURPLE + "Nothing to see here!" + ANSI_RESET;
}
StringBuilder return_string = new StringBuilder();
for (String str : this.URLs) {
return_string.append(str).append("\n");
}
return ANSI_PURPLE + "----- DEPTH: " + this.get_depth() + " -----" +
"\n\n" + ANSI_CYAN + return_string.toString() + ANSI_RESET;
}
}
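/*
* A minimal usage sketch for DepthPair (names taken from this file; the
* URL is a hypothetical stand-in):
*
*   DepthPair pair = new DepthPair(new ArrayList<>(), 0);
*   pair.append_url("https://www.example.com/a");
*   System.out.println(pair); // prints the depth banner, then the URL
*/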
public static class Crawler {
private ArrayList<DepthPair> url_pairs = new ArrayList<>();
private int max_depth;
private DepthPair root;
private boolean verbose;
private int threads;
/**
* @param max_depth : Maximum depth to crawl.
* @param base_url : The root url to start the crawl.
* @param threads : Number of threads to use.
* @param verbose : Print non-critical exceptions and meta-data.
* @Constructor
*/
Crawler(int max_depth, String base_url, int threads, boolean verbose) {
this.max_depth = max_depth;
ArrayList<String> base = new ArrayList<>();
base.add(base_url);
this.root = new DepthPair(base, 0);
this.threads = threads;
this.verbose = verbose;
}
/**
* Starts the process.
*/
void run_crawler() {
System.out.println(
ANSI_GREEN + "Successfully started the web crawler with "
+ threads + " threads!" + ANSI_RESET);
get_url_pairs(this.root);
}
/**
* This should probably Override toString(), but
* it doesn't.
*
* @return String
*/
String print_visited_urls() {
StringBuilder depth_pairs = new StringBuilder();
for (DepthPair pair : this.url_pairs) {
depth_pairs.append(pair.toString()).append("\n");
}
return depth_pairs.toString();
}
/**
* The worker responsible for scraping URLs.
* This nested class does not read the outer class's fields directly;
* everything it needs is passed in, though it does call the outer
* get_url_components() helper.
*/
public class Worker implements Runnable {
final private List<String> urls;
final private ArrayList<DepthPair> global_pairs;
private boolean verbose;
private ArrayList<String> visited_urls = new ArrayList<>();
private DepthPair to_visit = new DepthPair();
/**
* @param urls : URLs for the worker to visit.
* @param global_pairs : Reference to our list of visited urls at all previous depths.
* @param verbose : Passed from the Crawler class, for printing to stdout.
*/
Worker(List<String> urls, ArrayList<DepthPair> global_pairs,
boolean verbose) {
this.urls = urls;
this.global_pairs = global_pairs;
this.verbose = verbose;
}
/**
* This run method contains the core logic of the web crawler.
* Each worker that is spawned receives a subset of URLs to visit.
* <p>
* First, a socket is created and the contents of the page are parsed
* for links; any link we have not already visited is appended to a
* list of URLs to visit.
* <p>
* We also keep track of all the URLs we visit during this process.
* <p>
* Each worker has its own objects for storing the two lists mentioned;
* this avoids any race conditions when updating the lists. Duplication
* is handled in the parent, where each worker also returns the data it
* gathered during a run after all workers have been joined.
*/
@Override
public void run() {
// We don't care about relative links.
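// The [^#/"'] guard means, e.g., href="https://example.com/a" matches,
// while href="#anchor" and href="/relative/path" are skipped.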
Pattern url_pattern = Pattern.compile("href=[\"'][^#/\"'](.*?)[\"']");
for (String url : this.urls) {
String path, host;
String[] url_value = get_url_components(url);
if ((host = url_value[0]).equals("")) {
// We had a MalformedURL.
continue;
}
if ((path = url_value[1]).equals("")) {
path = "/";
}
SocketFactory socketFactory = SSLSocketFactory.getDefault();
try (Socket socket = socketFactory.createSocket(host, 443)) {
socket.setSoTimeout(5000);
OutputStream output = socket.getOutputStream();
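// A hand-rolled HTTP/1.1 GET. `Connection: close` asks the server to
// end the stream after the response, so readAsString() below can
// simply read until EOF.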
String request = "GET " + path +
" HTTP/1.1\r\nConnection: close\r\nHost: " + host + "\r\n\r\n";
output.write(request.getBytes(StandardCharsets.US_ASCII));
InputStream in = socket.getInputStream();
String response = readAsString(in);
Matcher url_substrings = url_pattern.matcher(response);
while (url_substrings.find()) {
String url_to_add = url_substrings.group().split("[\"']")[1];
if (get_url_components(url_to_add)[0].equals("")) {
continue;
}
AtomicBoolean add = new AtomicBoolean(true);
outer:
for (DepthPair pair : this.global_pairs) {
// Check for redundant links.
for (String u : pair.get_urls()) {
String[] test = get_url_components(url_to_add);
if (u.equals(test[0] + test[1])) {
add.set(false);
break outer;
}
}
}
if (add.get()) {
this.to_visit.append_url(url_to_add);
}
}
} catch (IOException e) {
if (this.verbose) {
System.out.println(
ANSI_YELLOW + "Socket creation failed due to:\n"
+ "\t" + e.toString() + ANSI_GREEN + "\nSkipping..."
+ ANSI_RESET);
}
}
this.visited_urls.add(host + path);
}
}
ArrayList<String> get_urls_to_visit() {
return this.to_visit.get_urls();
}
ArrayList<String> get_visited_urls() {
return this.visited_urls;
}
}
/**
* @param depth_pair : DepthPair object containing the URLs and the depth
* they were scraped from.
* @return DepthPair : This updates the Crawler.url_pairs member, so
* the return value just satisfies the recursion.
*/
private DepthPair get_url_pairs(DepthPair depth_pair) {
// This function is recursive and uses `max_depth`
// as the base case.
if (depth_pair.depth > this.max_depth) {
return depth_pair;
}
if (this.verbose && depth_pair.get_depth() != 0) {
System.out.println("\n" + ANSI_BLUE + "Crawling at a depth of : "
+ depth_pair.get_depth() + ANSI_RESET + "\n");
}
DepthPair to_visit = new DepthPair();
to_visit.set_depth(depth_pair.depth + 1);
ArrayList<String> visited_urls = new ArrayList<>();
ArrayList<String> visit_list = new ArrayList<>();
try {
if (depth_pair.get_urls().size() == 1) {
// We always start with one URL.
Worker worker = new Worker(
depth_pair.get_urls().subList(0, 1),
this.url_pairs,
this.verbose
);
Thread thread = new Thread(worker);
thread.start();
thread.join();
visited_urls.addAll(worker.get_visited_urls());
visit_list.addAll(worker.get_urls_to_visit());
} else {
Worker[] workers = new Worker[this.threads];
Thread[] threads = new Thread[this.threads];
double split = Math.floor(
depth_pair.get_urls().size() / (double) this.threads);
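// Worked example: 10 URLs across 4 threads gives split = floor(10/4) = 2,
// so the workers receive the sublists [0,2), [2,4), [4,6) and, because
// the last worker takes everything that remains, [6,10).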
for (int i = 0; i < this.threads; i++) {
// The last worker absorbs any remainder left by the integer division.
double work_split = split * (i + 1);
if (i == this.threads - 1) {
work_split = depth_pair.get_urls().size();
}
workers[i] = new Worker(
depth_pair.get_urls().subList(
((int) split * i), (int) work_split),
this.url_pairs,
this.verbose
);
threads[i] = new Thread(workers[i]);
threads[i].start();
}
// These `for` loops are important to keep distinct
// so that we wait for the work to finish before
// trying to aggregate our results.
for (int i = 0; i < this.threads; i++) {
threads[i].join();
}
for (int i = 0; i < this.threads; i++) {
visited_urls.addAll(workers[i].get_visited_urls());
visit_list.addAll(workers[i].get_urls_to_visit());
}
}
} catch (InterruptedException |
ConcurrentModificationException ex) {
System.out.println(
ANSI_RED + "A worker unexpectedly failed!" + ANSI_RESET);
System.out.println("\t" + ex.toString());
}
// Remove all duplicates before going deeper (cue inception music).
Set<String> set = new HashSet<>(visit_list);
visit_list.clear();
visit_list.addAll(set);
to_visit.append_urls(visit_list);
// This, along with the `AtomicBoolean` should
// ensure that we don't aggregate duplicates.
set.clear();
set.addAll(visited_urls);
visited_urls.clear();
visited_urls.addAll(set);
DepthPair visited = new DepthPair(visited_urls, depth_pair.get_depth());
this.url_pairs.add(visited);
return get_url_pairs(to_visit);
}
/**
* @param inputStream : Input stream from a given socket.
* @return String : A string representation of the byte stream.
* @throws IOException : Handled by the caller.
*/
private static String readAsString(final InputStream inputStream)
throws IOException {
// https://stackoverflow.com/a/45419247
final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
final byte[] buffer = new byte[1024];
int bytesRead;
while ((bytesRead = inputStream.read(buffer)) > 0) {
outputStream.write(buffer, 0, bytesRead);
}
return outputStream.toString(StandardCharsets.UTF_8.name());
}
/**
* @param url : The URL to have host and path split.
* @return String[] : The host and path from the passed URL.
*/
private String[] get_url_components(String url) {
// This function assumes that the caller passes URLs
// in the form `http[s?]://<host>/[<path>?]`.
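// e.g. the hypothetical "https://www.example.com/a/b" splits into
// host "www.example.com" and path "/a/b".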
URL url_obj;
try {
url_obj = new URL(url);
} catch (MalformedURLException ex) {
if (this.verbose) {
System.out.println(ANSI_YELLOW +
"Malformed URL: " + url + "\n\tSkipping..." + ANSI_RESET);
}
return new String[]{"", ""};
}
return new String[]{url_obj.getHost(), url_obj.getPath()};
}
}
public static void main(String[] args) {
String arg;
String url = "https://www.google.com";
int i = 0;
int threads = 4;
int depth = 3;
boolean verbose = false;
if (args.length == 0) {
System.out.println(ANSI_BLUE + "Running in default mode! Type `--help` for options." + ANSI_RESET);
}
// Parse the command line
// TODO: Handle incorrect arg values.
while (i < args.length && args[i].startsWith("--")) {
arg = args[i++];
switch (arg) {
case "--url":
url = args[i++];
break;
case "--depth":
String d = args[i++];
depth = Integer.parseInt(d);
break;
case "--verbose":
verbose = true;
break;
case "--parallel":
if (i < args.length) {
String t = args[i++];
threads = Integer.parseInt(t);
} else {
System.out.println(ANSI_RED + "-parallel requires an integer" + ANSI_RESET);
}
break;
case "--help":
System.out.println(ANSI_BLUE + "\nUsage: " + ANSI_GREEN + "fievel " + ANSI_BLUE +
"[--url url] [--depth D] [--verbose] [--parallel N] [--help]\n\n" +
"--url: The website to be used as the root for the\n" +
" recursive web crawl.\n" + ANSI_PURPLE +
" Default is `https://www.google.com`.\n" + ANSI_BLUE +
"--depth: The cut off depth for the web crawler. The layers\n" +
" of recursion will not exceed this value.\n" + ANSI_PURPLE +
" Default is 3.\n" + ANSI_BLUE +
"--parallel: The number of threads to be ran during a web crawl.\n" + ANSI_PURPLE +
" Default is 4.\n" + ANSI_BLUE +
"--verbose: Print all non-critical error messaging, along with\n" +
" meta-data relating to the job.\n" + ANSI_PURPLE +
" Default is false." + ANSI_RESET);
System.exit(0);
}
}
try {
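// Validation only; the parsed URL object is never used afterwards.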
URL test = new URL(url);
} catch (MalformedURLException ex) {
System.out.println(ANSI_RED + "URL should be in the from 'https://www.<hostname>/<path>'" +
ANSI_RESET);
System.out.println("Exiting...");
System.exit(1);
}
Crawler crawler = new Crawler(depth, url, threads, verbose);
crawler.run_crawler();
System.out.println();
System.out.println(crawler.print_visited_urls());
}
}
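/*
* Example invocation (a sketch; the flags mirror the --help text above,
* and the URL is a stand-in):
*
*   javac fievel.java
*   java fievel --url https://www.example.com --depth 2 --parallel 8 --verbose
*/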