-
Notifications
You must be signed in to change notification settings - Fork 1
/
spiderProcessQueue.php
executable file
·190 lines (154 loc) · 10.6 KB
/
spiderProcessQueue.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
<?php
/**
 * This is a basic tool that scans an existing archive and tries to fill in any gaps. It can be used if a download gets partially corrupted, or your Heritrix job gets corrupted.
 * It even detects a small number of things Heritrix fails to, e.g. <img srcset>. I use it even on completed Heritrix jobs to ensure a full archive.
 * Additionally, it will clean up the directory structure a bit. MirrorReader handles the 1-suffixed directories fairly well (albeit at the cost of speed), but it does not handle 1-suffixed files, which this does deal with.
 * Also note that it will write 301 redirects where they are expected by the including files, unless the 301 is specified in config.php. This is good for archive viewing, but does mean you'll end up with duplicated files. (I find it worthwhile, in any case, to have them located in both places.)
 * Currently searches <a href>, <img src>, and <img srcset>.
 * Does not redownload existing files.
 * Does not check content type, but will only download files matching valid string/regex rules.
 *
 * Request parameters (query string):
 *   resource  (required) name of the resource whose queue is processed; also used as the host part of the base URI.
 *   protocol  (optional) "http" or "https"; defaults to "http".
 *   match     (optional) pattern passed to the Spider; defaults to ".*".
 */

// For now, report all errors.
error_reporting(E_ALL);
ini_set('display_errors', 'On');
ini_set('display_startup_errors', 'On');

// Allow Unlimited Execution Time
set_time_limit(0);

// Format as text output
header('Content-Type: text/plain');

// Require Configuration Files
require(__DIR__ . '/vendor/autoload.php');
require('config.php');

// Trial (dry-run) switch: when true, nothing is written to disk and downloaded files are not scanned for outlinks.
// It is currently false, so this script DOES write files.
$trial = false;

// Disable the script hacks by default, since they are liable to include too many files in our scan.
\MirrorReader\Processor::$domainConfiguration['default']['scriptHacks'] = [];

// Get $_GETs
$resource = $_GET['resource'] ?? '';
$protocol = $_GET['protocol'] ?? 'http';
$match = $_GET['match'] ?? '.*';

// Refuse to run without a resource: every queue, log and URL below is derived from it.
if (!is_string($resource) || $resource === '') {
    http_response_code(400);
    exit("Missing required parameter: resource\n");
}

// The protocol is concatenated straight into the base URI, so only accept the schemes we expect.
if (!is_string($protocol) || !in_array(strtolower($protocol), ['http', 'https'], true)) {
    http_response_code(400);
    exit("Unsupported protocol (expected http or https)\n");
}

// NOTE(review): $path is not used anywhere below in this file; kept in case included code relies on it -- confirm.
$path = realpath(\MirrorReader\Processor::$store . $resource);
$spider = new \MirrorReader\Spider($resource, $match);

// Every log line of this run goes to the same per-resource logger, so look it up once.
$logger = \MirrorReader\Logger::getLogger("Spider-{$resource}");

/*
 * Redirect hook, invoked by Guzzle for every redirect it follows.
 * Writes a small HTML "internal redirect" page at the store location of the redirecting URL,
 * unless that location is the same as the target's or a file already exists there.
 */
$recordRedirect = function (
    \Psr\Http\Message\RequestInterface $request,
    \Psr\Http\Message\ResponseInterface $response,
    \Psr\Http\Message\UriInterface $uri
) use ($trial, $logger) {
    $sourceObject = \MirrorReader\Factory::get((string) $request->getUri());
    $responseObject = \MirrorReader\Factory::get((string) $uri);
    $redirectFile = $sourceObject->getFileStore();

    if ($responseObject->getFileStore() == $redirectFile) {
        $logger->warn('Redirect file points to itself (maybe HTTPS?)', [$request->getUri(), $uri]);
    }
    elseif (file_exists($redirectFile)) {
        $logger->warn('File already exists at redirect location', [$redirectFile, $uri]);
    }
    else {
        $logger->info('Writing internal redirect file...', [$redirectFile, $uri]);

        if (!$trial) {
            // Create the PARENT directory of the redirect file (recursively), never the file path itself:
            // a directory at the file path would make the write below impossible.
            $redirectDir = dirname($redirectFile);
            $escapedTarget = htmlspecialchars((string) $uri);

            if (!is_dir($redirectDir) && !mkdir($redirectDir, 0777, true) && !is_dir($redirectDir)) {
                $logger->error('Failed to create directory for internal redirect file...', [$redirectFile, $uri]);
            }
            // file_put_contents() returns the byte count, so compare against false rather than truthiness.
            elseif (file_put_contents($redirectFile,
                '<!-- MirrorReader Redirect Page --><html><head>'
                . '<title>Internal Redirect</title><meta http-equiv="refresh" content="0; url=' . $escapedTarget . '">'
                . '</head><body>'
                . '<center><a href="' . $escapedTarget . '">Follow redirect.</a></center>'
                . '</body></html>'
            ) !== false) {
                $logger->notice('Wrote internal redirect file...', [$redirectFile, $uri]);
            }
            else {
                $logger->error('Failed to write internal redirect file...', [$redirectFile, $uri]);
            }
        }
    }
};

// The client configuration does not depend on the individual message, so build it once and reuse it for the whole run.
$client = new \GuzzleHttp\Client([
    'base_uri' => $protocol . '://' . $resource,
    'http_errors' => false,

    // When all is said and done, we want to write the effective URL, not the original request (which should be handled by the redirects)
    'on_stats' => function (\GuzzleHttp\TransferStats $stats) use (&$effectiveUrl, &$transferTime) {
        $effectiveUrl = $stats->getEffectiveUri();
        $transferTime = $stats->getTransferTime();
    },

    'allow_redirects' => [
        // A big part of this is that, unlike Heritrix MirrorWriter, we will record all redirects and write correct files for them.
        'on_redirect' => $recordRedirect,
    ],
]);

// Drain the queue: receiveNoWait() returns a falsy value once no message is available, which ends the loop.
while ($message = \MirrorReader\Queue::getConsumer($resource)->receiveNoWait()) {
    $srcFile = \MirrorReader\Factory::get($message->getBody());
    $destFile = $srcFile->getFileStore();

    if (!$destFile) {
        $logger->warn('Skipping file, dest root directory doesn\'t exist', [$srcFile, $destFile]);
    }
    elseif (strlen(basename($destFile)) > 254) {
        $logger->warn('Skipping file, destination base name too long', [$srcFile, $destFile]);
    }
    elseif (!is_dir(dirname($destFile)) && !\MirrorReader\MkdirIndex::execute(dirname($destFile))) {
        $logger->error('Skipping file, couldn\'t create destination directory', [$srcFile, $destFile]);
    }
    elseif ($srcFile->fileStore301less != $srcFile->getFileStore()) {
        $logger->warn('Skipping file, appears to have 301 path -- run fileNameFixer.php first', [$srcFile, $destFile]);
    }
    else {
        // A HEAD request first: it follows (and records) redirects and tells us the effective URL
        // via the on_stats hook, without downloading the body.
        $client->head($srcFile->getFile());
        $effectiveUrlObject = \MirrorReader\Factory::get((string) $effectiveUrl);

        if (file_exists($effectiveUrlObject->getFileStore())) {
            $logger->notice('Effective URL already exists', [$srcFile->getFile(), $effectiveUrlObject->getFile(), $effectiveUrlObject->getFileStore()]);
        }
        else {
            $response = $client->get($srcFile->getFile());

            $logger->info('Downloaded file', [
                'source' => $srcFile->getFile(),
                'effective' => $effectiveUrlObject->getFile(),
                'time' => $transferTime,
                'status' => $response->getStatusCode(),
                'headers' => $response->getHeaders()
            ]);

            if ($response->getStatusCode() !== 200) {
                $logger->warn('File does not return 200', [$srcFile->getFile(), $effectiveUrlObject->getFile(), $effectiveUrlObject->getFileStore(), $response]);

                // Client and server errors are remembered in a per-resource Redis set.
                if ($response->getStatusCode() >= 400) {
                    $logger->warn('File with >= 400 status code was added to Redis block list', [$srcFile->getFile(), $effectiveUrlObject->getFile(), $effectiveUrlObject->getFileStore(), $response]);
                    \MirrorReader\RedisInstance::get()->sAdd("Spider-{$resource}-errors", $srcFile->getFile());
                }
            }
            else {
                if (
                    !is_file($effectiveUrlObject->getFileStore())
                    && is_dir($effectiveUrlObject->getFileStore())
                    && !file_exists($effectiveUrlObject->getFileStore() . "/index.html")
                ) {
                    // The store path is a directory without an index file: save the body as its index.html.
                    $logger->info('Writing new file (as index.html)', [$srcFile->getFile(), $effectiveUrlObject->getFile(), $effectiveUrlObject->getFileStore()]);

                    if (!$trial) {
                        // Compare against false: an empty body is written successfully but yields 0 bytes.
                        if (file_put_contents($effectiveUrlObject->getFileStore() . "/index.html", $response->getBody()->getContents()) !== false) {
                            $logger->notice('Wrote new file (as index.html)', [$srcFile->getFile(), $effectiveUrlObject->getFile(), $effectiveUrlObject->getFileStore()]);
                        } else {
                            $logger->error('Failed to write new file (as index.html)', [$srcFile->getFile(), $effectiveUrlObject->getFile(), $effectiveUrlObject->getFileStore()]);
                        }
                    }
                }
                else {
                    $logger->info('Writing new file', [$srcFile->getFile(), $effectiveUrlObject->getFile(), $effectiveUrlObject->getFileStore()]);

                    if (!$trial) {
                        // Compare against false: an empty body is written successfully but yields 0 bytes.
                        if (file_put_contents($effectiveUrlObject->getFileStore(), $response->getBody()->getContents()) !== false) {
                            $logger->notice('Wrote new file', [$srcFile->getFile(), $effectiveUrlObject->getFile(), $effectiveUrlObject->getFileStore()]);
                        } else {
                            $logger->error('Failed to write new file', [$srcFile->getFile(), $effectiveUrlObject->getFile(), $effectiveUrlObject->getFileStore()]);
                        }
                    }
                }

                if (!$trial) {
                    usleep(50000);

                    // Now process the newly-obtained file for outlinks
                    // Tell the Resource Object to invoke the processFile function whenever it encounters a URL.
                    $effectiveUrlObject = \MirrorReader\Factory::get((string) $effectiveUrl); // recreate since the file should now exist
                    $effectiveUrlObject->formatUrlCallback = [$spider, 'processFile'];
                    $effectiveUrlObject->getContents();
                }
            }
        }
    }

    // Remove the message from the queue
    \MirrorReader\Queue::getConsumer($resource)->acknowledge($message);

    // Pause for half a second between messages, then push any buffered output to the client.
    usleep(500000);
    flush();
}