-
Notifications
You must be signed in to change notification settings - Fork 0
/
conserve
executable file
·188 lines (160 loc) · 5.01 KB
/
conserve
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#!/usr/bin/env bash
#
# conserve - mirror http sites for preservation on UNNA using wget
#
# CHANGE LOG:
#
# v0.1 2021-04-18 - Morgan Aldridge <[email protected]>
# Initial version.
# v0.1.1 2021-04-20 - Morgan Aldridge
# Check for missing files in Internet Archive Wayback
# Machine. Check files for links to original host. Tweaks
# to wget params.
#
# MIT LICENSE:
#
# Copyright (c) 2021 Morgan Aldridge
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# in_array - test whether a value is present in a list of values
#
# Arguments: $1   - the value to look for
#            $2.. - the candidate values (typically "${array[@]}")
# Returns:   0 if $1 is non-empty and equal to one of the candidates,
#            1 otherwise (including an empty value or an empty list)
function in_array() {
  local value="$1"
  local array_value=""

  # an empty (or missing) value never matches anything
  if [ $# -lt 1 ] || [ -z "${value}" ] ; then
    return 1
  fi
  shift

  for array_value in "$@" ; do
    if [ "${value}" = "${array_value}" ] ; then
      # stop at the first hit; there is no need to scan the rest
      return 0
    fi
  done

  return 1
}
# get_url_host - extract the host portion of an http(s) URL
#
# Arguments: $1 - the URL
# Outputs:   everything between "http://" or "https://" and the next "/"
#            (or the end of the string), without a trailing newline
# Returns:   0 if the URL started with http:// or https:// followed by at
#            least one non-"/" character, 1 otherwise (nothing is printed)
function get_url_host() {
  local url="$1"

  if [[ "${url}" =~ ^https?://([^/]+) ]] ; then
    # print through '%s' so that a '%' or '\' in the URL is emitted
    # verbatim instead of being interpreted as a printf directive
    printf '%s' "${BASH_REMATCH[1]}"
    return 0
  fi

  return 1
}
# get_url_path - extract the path portion of an http(s) URL
#
# Arguments: $1 - the URL
# Outputs:   the path, starting at the first "/" after the host and ending
#            before any "?query" or "#fragment", without a trailing newline
# Returns:   0 if a path was found and printed, 1 otherwise (e.g. the URL
#            has no "/" after the host, or is not http/https)
function get_url_path() {
  local url="$1"
  # kept in a variable so the '#' and '?' need no shell escaping
  local path_re='^https?://[^/]+(/[^?#]*)'

  if [[ "${url}" =~ ${path_re} ]] ; then
    # print through '%s' so percent-escapes such as %20 in the path are
    # emitted verbatim instead of being interpreted as printf directives
    printf '%s' "${BASH_REMATCH[1]}"
    return 0
  fi

  return 1
}
# find_missing_files - look in the local mirror for files whose name matches
# the last path component of a URL, ignoring case
#
# Globals:   host (read) - directory that is searched
# Arguments: $1 - the URL whose file is being looked for
# Outputs:   a " Potentially matching files:" heading followed by one line
#            per match; nothing at all when there are no matches
# Returns:   0
function find_missing_files() {
  local url="$1"
  local name=""
  local line=""
  local match=""
  local matches=()

  name="$(basename "$(get_url_path "${url}")")"

  # nothing sensible to search for (no path, or the site root), or no
  # mirror directory to search in
  if [ -z "${name}" ] || [ "${name}" = "/" ] || [ ! -d "${host}" ] ; then
    return 0
  fi

  # NOTE: find treats the name as a glob pattern, so characters such as
  # '*', '?' and '[' in it act as wildcards
  while IFS= read -r line ; do
    if [ -n "${line}" ] ; then
      matches+=("${line}")
    fi
  done < <(find "${host}" -iname "${name}")

  if [ "${#matches[@]}" -gt 0 ] ; then
    printf " Potentially matching files:\n"
    for match in "${matches[@]}" ; do
      printf " %s\n" "${match}"
    done
  fi

  return 0
}
# get_wayback_machine_url - ask the Internet Archive availability API for
# the closest archived snapshot of a URL
#
# Arguments: $1 - the URL to look up
# Outputs:   the snapshot URL, without a trailing newline; nothing if the
#            API reports no available snapshot or the lookup fails
# Returns:   0 if a snapshot URL was printed, 1 otherwise
# Requires:  curl, jq
function get_wayback_machine_url() {
  local url="$1"
  local wayback_url=""

  # -G with --data-urlencode sends the URL as a properly encoded query
  # parameter, so '&', '?' or '#' inside it cannot break the request.
  # The jq filter yields the snapshot's "url" only when the snapshot is
  # marked available, and yields nothing when "closest" is absent.
  wayback_url="$(
    curl -s -G --data-urlencode "url=${url}" "https://archive.org/wayback/available" \
      | jq -r '.archived_snapshots.closest | select(.available == true) | .url // empty' 2>/dev/null
  )"

  if [ -n "${wayback_url}" ] ; then
    # '%s' keeps any '%' in the snapshot URL from acting as a directive
    printf '%s' "${wayback_url}"
    return 0
  fi

  return 1
}
# parse_wget_log_errors - list every request in a wget log whose response
# status was 300 or higher
#
# Arguments: $1 - path to the wget log file
# Outputs:   a "Failed paths:" heading, then for each such response the
#            status and URL, whatever find_missing_files prints for that
#            URL, and the Wayback Machine URL if get_wayback_machine_url
#            printed one
function parse_wget_log_errors() {
  local log="$1"
  local request_url=""
  local status=""
  local snapshot=""
  local entry=""

  printf "Failed paths:\n\n"

  while IFS= read -r entry ; do
    # a "--YYYY-MM-DD HH:MM:SS--  URL" line names the URL being requested;
    # remember it for the response line that follows
    if [[ "${entry}" =~ ^--[0-9]{4}-[0-9]{2}-[0-9]{2}\ [0-9]{2}:[0-9]{2}:[0-9]{2}--\ \ (https?://.+)$ ]] ; then
      request_url="${BASH_REMATCH[1]}"
      continue
    fi

    # every other line only matters if it carries a response status
    [[ "${entry}" =~ request\ sent,\ awaiting\ response\.\.\.\ ([0-9]{3}) ]] || continue
    status="${BASH_REMATCH[1]}"
    [ "${status}" -lt 300 ] && continue

    printf "%3i: %s\n" "${status}" "${request_url}"
    find_missing_files "${request_url}"

    snapshot="$(get_wayback_machine_url "${request_url}")"
    [ -z "${snapshot}" ] || printf " Available in the Wayback Machine: %s\n" "${snapshot}"
  done < "${log}"
}
# find_urls_to_update - list the mirrored files that still mention the
# original host
#
# Globals:   host (read) - both the directory that is searched and the host
#            name that is searched for (as "//<host>", case-insensitively)
# Outputs:   a heading followed by one matching file path per line
# Returns:   0
function find_urls_to_update() {
  printf "The following files may still contain links to %s:\n\n" "${host}"

  # -l prints each matching file exactly once, so no de-duplication or
  #    splitting of "file:line" output is needed (and file names that
  #    contain ':' come through intact)
  # -F matches the host literally, so its dots are not regex wildcards
  grep -iRlF -- "//${host}" "${host}"

  # finding no remaining links is not an error
  return 0
}
# global variables
# (host is also read by find_missing_files and find_urls_to_update)
date="$(date +%Y%m%d-%H%M%S)"
url="${1:-}"

# input validation
if [ -z "${url}" ] ; then
  printf "Missing URL to mirror!\n" >&2
  # stop here; otherwise the host check below would report a second,
  # misleading error about an empty URL
  exit 1
fi
host="$(get_url_host "${url}")"
if [ -z "${host}" ] ; then
  printf "Unable to parse host out of URL '%s'!\n" "${url}" >&2
  exit 1
fi
log="${host}-${date}.log"

# mirror with wget (long options used for readability)
printf "Mirroring '%s' to './%s'...\n" "${url}" "${host}"
if ! LC_CTYPE="en_US.UTF-8" \
  wget \
    --mirror \
    --page-requisites \
    --no-parent \
    --convert-links \
    --tries=30 \
    --wait=5 \
    --random-wait \
    --execute robots=off \
    --output-file="${log}" \
    "${url}" ; then
  printf "Mirroring of '%s' encountered some errors! Please review the log in '%s'.\n\n" "${url}" "${log}"
  parse_wget_log_errors "${log}"
else
  printf "Done.\n\n"
fi

find_urls_to_update